In [126]:
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [127]:
def get_next_month(date: str) -> str:
    """Return the calendar month that follows ``date``.

    ``date`` is parsed as a monthly ``pd.Period``; the result is the string
    form of that period advanced by one month (e.g. ``"2024-12"`` ->
    ``"2025-01"``).
    """
    current_month = pd.Period(date, freq="M")
    following_month = current_month + 1
    return str(following_month)



## Load the training data

In [128]:
# Load the training table; every model below is fit on columns of this frame.
df = pd.read_csv('task_g/training.csv')

In [129]:
# Show the training frame (pandas truncates the rendering to head/tail rows).
df

Unnamed: 0,id,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,games_pvp,wins_pvp,...,current_avg_ping,current_month,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,log_revenue_current_month,log_ads_revenue_current_month
0,1,13,81,paid,iOS,100,21.373946,23.33529,265,205,...,0.275935,2024-07,0.0,0.0,0.0,0.0,False,1,3.191928,3.107897
1,1,1,81,paid,iOS,1,0.867439,8.87251,18,7,...,0.263144,2024-09,0.0,0.0,0.0,0.0,False,3,2.289754,0.624568
2,2,1,193,paid,Android,0,0.000000,0.00000,0,0,...,0.000000,2024-10,0.0,0.0,0.0,0.0,False,4,0.000000,0.000000
3,3,7,180,paid,Android,0,0.000000,0.00000,26,20,...,0.430013,2024-12,0.0,0.0,0.0,0.0,True,0,0.000000,0.000000
4,3,2,180,paid,Android,0,0.000000,0.00000,1,0,...,0.374937,2025-01,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196877,133672,2,173,paid,Android,0,0.000000,0.00000,9,2,...,0.223729,2025-01,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
196878,133673,14,49,paid,Android,32,0.042859,0.00000,161,90,...,0.359385,2024-10,0.0,0.0,0.0,0.0,False,0,0.000000,0.041966
196879,133674,1,208,paid,Android,0,0.000000,0.00000,0,0,...,0.000000,2025-01,0.0,0.0,0.0,0.0,False,8,0.000000,0.000000
196880,133674,2,208,paid,Android,2,0.137498,0.00000,5,1,...,0.151158,2024-07,0.0,0.0,0.0,0.0,False,2,0.000000,0.128831


In [130]:
# Fit the regressor that predicts `log_ads_revenue_next_month`

from sklearn.model_selection import train_test_split

# Inputs are every column except the user id, the month label and all
# next-month columns (the target and its raw / log / activity siblings).
X = df.drop(
    [
        "id",
        "current_month",
        "is_active_next_month",
        "ads_revenue_next_month",
        "revenue_next_month",
        "log_ads_revenue_next_month",
        "log_revenue_next_month",
    ],
    axis=1,
)
y = df["log_ads_revenue_next_month"]

features_names = list(X.columns)
# Columns handed to CatBoost as categorical; reused by the later training cells.
cat_features_names = ['country', 'traffic_type', 'platform']

# Seeded random 80/20 row split; the 20% part is the early-stopping eval set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

train_pool = cb.Pool(
    X_train,
    y_train,
    cat_features=cat_features_names,
    feature_names=features_names,
)
val_pool = cb.Pool(
    X_val,
    y_val,
    cat_features=cat_features_names,
    feature_names=features_names,
)

model_ads_revenue_next_month = cb.CatBoostRegressor(
    loss_function='RMSE',
    iterations=2500,
    learning_rate=0.01,
    l2_leaf_reg=2,
    subsample=0.8,
)

# Stop once the eval metric has not improved for 100 rounds; log every 100 iterations.
model_ads_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4513387	test: 0.4479678	best: 0.4479678 (0)	total: 10.3ms	remaining: 25.7s
100:	learn: 0.3101217	test: 0.3087410	best: 0.3087410 (100)	total: 858ms	remaining: 20.4s
200:	learn: 0.2794441	test: 0.2794698	best: 0.2794698 (200)	total: 1.71s	remaining: 19.6s
300:	learn: 0.2718542	test: 0.2727575	best: 0.2727575 (300)	total: 2.59s	remaining: 18.9s
400:	learn: 0.2688656	test: 0.2706029	best: 0.2706029 (400)	total: 3.53s	remaining: 18.5s
500:	learn: 0.2672605	test: 0.2696026	best: 0.2696026 (500)	total: 4.38s	remaining: 17.5s
600:	learn: 0.2659363	test: 0.2690028	best: 0.2690028 (600)	total: 5.25s	remaining: 16.6s
700:	learn: 0.2648026	test: 0.2685014	best: 0.2685014 (700)	total: 6.09s	remaining: 15.6s
800:	learn: 0.2637545	test: 0.2680939	best: 0.2680939 (800)	total: 6.95s	remaining: 14.7s
900:	learn: 0.2627781	test: 0.2677130	best: 0.2677130 (900)	total: 7.75s	remaining: 13.8s
1000:	learn: 0.2618477	test: 0.2674444	best: 0.2674444 (1000)	total: 8.62s	remaining: 12.9s
1100:	learn

<catboost.core.CatBoostRegressor at 0x70d090133bc0>

In [131]:
# Rank the ads-revenue model's features from most to least important.
imp = sorted(
    zip(
        model_ads_revenue_next_month.feature_names_,
        model_ads_revenue_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('currency_1', 21.227223603636183),
 ('ads_revenue_current_month', 18.963764351322997),
 ('log_ads_revenue_current_month', 17.655561261082152),
 ('currency_2', 13.340828739215631),
 ('current_passed_level', 6.313261601819904),
 ('logins_current_month', 5.017475922035451),
 ('months_after_reg', 2.0616897032596495),
 ('revenue_current_month', 1.759633695228549),
 ('log_revenue_current_month', 1.6176948186853328),
 ('quests', 1.6116801746912823),
 ('offers', 1.5911281999279936),
 ('ads_shown_current_month', 1.5292010158495577),
 ('current_avg_ping', 1.277188709683949),
 ('games_pvp', 1.0291753924981557),
 ('country', 0.8577088200163947),
 ('currency_6', 0.6861401010725715),
 ('wins_pvp', 0.6714812153848704),
 ('hard_quests', 0.6159135601984752),
 ('currency_3', 0.5457192764361145),
 ('currency_4', 0.5423304838023599),
 ('currency_7', 0.4372793062735931),
 ('platform', 0.28399938622332893),
 ('currency_5', 0.2808902054789996),
 ('traffic_type', 0.08303045617646486)]

In [132]:
# Training `revenue_next_month` (the model is fit on `log_revenue_next_month`)

# Inputs are every column except the user id, the month label and all
# next-month columns, so no target information leaks into the features.
X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["log_revenue_next_month"]


# Seed the shuffle: without `random_state` the train/validation partition (and
# therefore the early-stopping point and the fitted model) changes on every run.
# 42 matches the seed used for the ads-revenue split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=X.columns.tolist())
val_pool   = cb.Pool(X_val,   y_val,   cat_features=cat_features_names, feature_names=X.columns.tolist())

model_revenue_next_month = cb.CatBoostRegressor(
    iterations=5000,
    learning_rate=0.002,
    loss_function='RMSE',
    subsample=0.9,
    l2_leaf_reg=3,
)

# Stop once the eval metric has not improved for 100 rounds; log every 100 iterations.
model_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6131416	test: 0.6145268	best: 0.6145268 (0)	total: 11.7ms	remaining: 58.3s
100:	learn: 0.5596086	test: 0.5615131	best: 0.5615131 (100)	total: 934ms	remaining: 45.3s
200:	learn: 0.5199589	test: 0.5223891	best: 0.5223891 (200)	total: 1.93s	remaining: 46s
300:	learn: 0.4910039	test: 0.4938770	best: 0.4938770 (300)	total: 2.89s	remaining: 45.1s
400:	learn: 0.4700923	test: 0.4734019	best: 0.4734019 (400)	total: 3.83s	remaining: 43.9s
500:	learn: 0.4550742	test: 0.4587205	best: 0.4587205 (500)	total: 4.79s	remaining: 43s
600:	learn: 0.4443285	test: 0.4483535	best: 0.4483535 (600)	total: 5.71s	remaining: 41.8s
700:	learn: 0.4365828	test: 0.4409549	best: 0.4409549 (700)	total: 6.64s	remaining: 40.7s
800:	learn: 0.4309755	test: 0.4356985	best: 0.4356985 (800)	total: 7.61s	remaining: 39.9s
900:	learn: 0.4268871	test: 0.4319829	best: 0.4319829 (900)	total: 8.59s	remaining: 39.1s
1000:	learn: 0.4238453	test: 0.4293160	best: 0.4293160 (1000)	total: 9.55s	remaining: 38.2s
1100:	learn: 0.

<catboost.core.CatBoostRegressor at 0x70cff65eecc0>

In [133]:
# Rank the revenue model's features from most to least important.
imp = sorted(
    zip(
        model_revenue_next_month.feature_names_,
        model_revenue_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('revenue_current_month', 23.878715746105172),
 ('log_revenue_current_month', 23.416591150420544),
 ('currency_1', 14.396971101610175),
 ('currency_2', 11.600292680165202),
 ('current_passed_level', 5.267665955864428),
 ('offers', 3.492608618445363),
 ('log_ads_revenue_current_month', 2.1911016774725742),
 ('ads_revenue_current_month', 2.0686998427434573),
 ('logins_current_month', 1.9326035835040865),
 ('months_after_reg', 1.7996935809384174),
 ('country', 1.3829839521083358),
 ('ads_shown_current_month', 1.3663721526258499),
 ('quests', 1.1577697474885573),
 ('current_avg_ping', 0.8977407286131549),
 ('currency_6', 0.8685506528670442),
 ('hard_quests', 0.8285944790668519),
 ('currency_4', 0.7898853076433725),
 ('games_pvp', 0.7852931948172568),
 ('wins_pvp', 0.4906665422181988),
 ('currency_7', 0.4778550553120831),
 ('currency_3', 0.3407542167485042),
 ('platform', 0.29522973399873376),
 ('currency_5', 0.22876094268549318),
 ('traffic_type', 0.044599356537160434)]

In [134]:
# Training `is_active_next_month` with 5-fold cross-validation

# Inputs are every column except the user id, the month label and all
# next-month columns, so no target information leaks into the features.
X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["is_active_next_month"]

# Early stopping is configured once, on the `cb.cv` call below, instead of
# being repeated inside `params` as well.
params = dict(
    iterations=2000,
    learning_rate=0.025,
    loss_function='Logloss',
    subsample=0.9,
    l2_leaf_reg=3,
    eval_metric="AUC",
    auto_class_weights="Balanced"
)

pool = cb.Pool(X, y, cat_features=cat_features_names, feature_names=X.columns.tolist())

# `return_models=True` makes `cv` return the per-fold models alongside the
# metric table; the fold models are averaged at prediction time.
stats, models_is_active_next_month = cb.cv(
    pool=pool,
    params=params,
    fold_count=5,
    shuffle=True,
    early_stopping_rounds=200,
    plot=True,
    verbose=False,
    return_models=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.8858860491
bestIteration = 1983

Training on fold [1/5]

bestTest = 0.8867737532
bestIteration = 1759

Training on fold [2/5]

bestTest = 0.8864243565
bestIteration = 1375

Training on fold [3/5]

bestTest = 0.8883153723
bestIteration = 1992

Training on fold [4/5]

bestTest = 0.8870023238
bestIteration = 1991



In [135]:
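# NOTE: left disabled. `model_is_active_next_month` is never defined in this
# notebook; the cross-validation cell produces a list of per-fold models named
# `models_is_active_next_month`, so these lines would raise NameError as written.
# imp = list(zip(model_is_active_next_month.feature_names_, model_is_active_next_month.feature_importances_.tolist()))
#
# imp.sort(key=lambda x: x[1], reverse=True)
#
# imp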

# Predict

In [136]:
# Load the rows to score with the trained models.
prediction = pd.read_csv('task_g/predict.csv')

In [137]:
# Show the frame that will be scored.
prediction

Unnamed: 0,id,current_month,id.1,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,...,current_avg_ping,current_month.1,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,log_revenue_current_month,log_ads_revenue_current_month
0,6,2025-04,6,29,133,organic,Android,66,0.040797,0.000000,...,0.277034,2025-04,0.0,0.0,0.0,0.0,False,3,0.000000,0.039987
1,10,2025-04,10,30,164,paid,iOS,781,26.512756,121.358247,...,0.049059,2025-04,0.0,0.0,0.0,0.0,False,1,4.806953,3.314650
2,11,2025-04,11,7,93,organic,Android,3,0.021389,0.000000,...,0.325849,2025-04,0.0,0.0,0.0,0.0,False,0,0.000000,0.021164
3,18,2025-04,18,9,112,organic,iOS,67,2.316753,5.058957,...,0.396381,2025-04,0.0,0.0,0.0,0.0,False,0,1.801538,1.198986
4,23,2025-04,23,13,67,paid,Android,19,0.145450,0.000000,...,0.209484,2025-04,0.0,0.0,0.0,0.0,False,1,0.000000,0.135797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27245,133642,2025-04,133642,1,101,paid,Android,0,0.000000,0.000000,...,0.195539,2025-04,0.0,0.0,0.0,0.0,False,5,0.000000,0.000000
27246,133644,2025-04,133644,1,161,paid,Android,0,0.000000,0.000000,...,0.652246,2025-04,0.0,0.0,0.0,0.0,False,2,0.000000,0.000000
27247,133651,2025-04,133651,1,8,paid,Android,0,0.000000,0.000000,...,0.000000,2025-04,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
27248,133662,2025-04,133662,2,164,paid,Android,2,0.002376,0.000000,...,0.172545,2025-04,0.0,0.0,0.0,0.0,False,4,0.000000,0.002374


In [138]:
def predict_average(models, input):
    """Average the class-1 probability predicted by several models.

    Each model is given only the columns listed in its own ``feature_names_``
    and is asked for ``prediction_type="Probability"``; column 1 of that
    output is kept.

    Args:
        models: iterable of fitted models exposing ``feature_names_`` and
            ``predict(data, prediction_type=...)``.
        input: DataFrame containing at least every model's feature columns.

    Returns:
        1-D array with one averaged probability per row of ``input``.

    Raises:
        ValueError: if ``models`` is empty. Averaging nothing would yield NaN,
            which a later ``> 0.5`` threshold silently turns into "inactive".
    """
    models = list(models)
    if not models:
        raise ValueError("predict_average requires at least one model")

    per_model_probabilities = np.array([
        model.predict(input[model.feature_names_], prediction_type="Probability")[:, 1]
        for model in models
    ])

    # Rows are models, columns are samples: average over the model axis.
    return per_model_probabilities.mean(axis=0)

# Fold-averaged probability of being active, thresholded at 0.5 into a 0/1 flag.
active_probability = predict_average(models_is_active_next_month, prediction)
prediction['is_active'] = (active_probability > 0.5).astype(np.int8)

# Both regressors were fit on `log_*` targets; `np.expm1` is applied to their
# output (presumably the log columns are log1p of the raw values; confirm upstream).
for target_column, regressor in (
    ('revenue_next_month', model_revenue_next_month),
    ('ads_revenue_next_month', model_ads_revenue_next_month),
):
    log_scale_prediction = regressor.predict(prediction[regressor.feature_names_])
    prediction[target_column] = np.expm1(log_scale_prediction)

submission = prediction.loc[
    :, ['id', 'is_active', 'revenue_next_month', 'ads_revenue_next_month']
].copy()

submission

Unnamed: 0,id,is_active,revenue_next_month,ads_revenue_next_month
0,6,1,0.007898,0.053087
1,10,1,44.726969,14.856674
2,11,0,0.005494,0.011714
3,18,0,0.245564,0.165263
4,23,0,0.009752,0.006881
...,...,...,...,...
27245,133642,0,0.009154,0.009384
27246,133644,0,0.001101,0.000251
27247,133651,0,0.002205,0.000863
27248,133662,0,0.009559,0.015232


In [139]:
# Total predicted revenue = in-app revenue + ads revenue, each floored at 0.
#
# The frame is rebuilt from `prediction` rather than by rewriting `submission`
# in place: this keeps the cell idempotent. Previously a second run raised
# KeyError, because the first run had already dropped the two columns it reads.
submission = prediction[ ['id', 'is_active'] ].copy()
submission[ 'next_month_revenue' ] = np.maximum(prediction['revenue_next_month'], 0) + np.maximum(prediction['ads_revenue_next_month'], 0)


In [140]:
# Show the per-user predictions (id, is_active, next_month_revenue).
submission

Unnamed: 0,id,is_active,next_month_revenue
0,6,1,0.060986
1,10,1,59.583643
2,11,0,0.017209
3,18,0,0.410826
4,23,0,0.016633
...,...,...,...
27245,133642,0,0.018538
27246,133644,0,0.001352
27247,133651,0,0.003068
27248,133662,0,0.024791


In [141]:
# Build the list of unique ids found in the history file, sorted ascending.
sub = (
    pd.read_csv('task_g/history.csv')
    .drop_duplicates(subset='id', keep='first')
    .loc[:, ['id']]
    .sort_values('id')
)

In [142]:
# Give every id a default prediction (inactive, zero revenue) and index the
# frame by `id` while also keeping `id` as a regular column.
sub = (
    sub
    .assign(is_active=0, next_month_revenue=0.0)
    .set_index('id', drop=False)
)

In [143]:
# Overwrite the defaults with model predictions for the ids that were scored.
# `update` aligns on the index (`id`); ids in `submission` that are absent
# from `sub` are ignored.
sub.update(submission.set_index("id"))

# The alignment reindexes `submission` onto every id, which introduces NaN, and
# depending on the pandas version `update` can leave `is_active` as float
# (written to CSV as 1.0 / 0.0). Re-assert the integer dtype explicitly.
sub['is_active'] = sub['is_active'].astype('int64')

In [144]:
# Put the columns in the order used for the written file.
sub = sub.loc[:, ['id', 'next_month_revenue', 'is_active']]

sub

Unnamed: 0_level_0,id,next_month_revenue,is_active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0.0,0
2,2,0.0,0
3,3,0.0,0
4,4,0.0,0
5,5,0.0,0
...,...,...,...
133670,133670,0.0,0
133671,133671,0.0,0
133672,133672,0.0,0
133673,133673,0.0,0


In [145]:
# Write the submission; the index is omitted because `id` is also kept as a
# regular column.
sub.to_csv('task_g/submission.csv', index=False)