In [194]:
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [195]:
def get_next_month(date: str) -> str:
    """Return the month that follows `date`, formatted the way a monthly `pd.Period` prints.

    `date` is parsed as a monthly period, advanced by one month, and converted
    back to a string (for example "2024-12" becomes "2025-01").
    """
    current_period = pd.Period(date, freq="M")
    following_period = current_period + 1
    return str(following_period)



## Clean up the data

In [196]:
# Load the training table; the cells below read both the features and the
# three `*_next_month` targets from this frame.
df = pd.read_csv('training.csv')

In [197]:
# Preview the loaded training frame (pandas truncates long/wide frames on display).
df

Unnamed: 0,id,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,games_pvp,wins_pvp,...,currency_4,currency_5,currency_6,currency_7,current_avg_ping,current_month,ads_revenue_next_month,revenue_next_month,is_active_next_month,months_after_reg
0,1,13,81,paid,iOS,100,21.373946,23.33529,265,205,...,26,0,0,0,0.275935,2024-07,0.0,0.0,False,1
1,1,1,81,paid,iOS,1,0.867439,8.87251,18,7,...,0,0,0,0,0.263144,2024-09,0.0,0.0,False,3
2,2,1,193,paid,Android,0,0.000000,0.00000,0,0,...,0,0,0,0,0.000000,2024-10,0.0,0.0,False,4
3,3,7,180,paid,Android,0,0.000000,0.00000,26,20,...,0,0,0,0,0.430013,2024-12,0.0,0.0,True,0
4,3,2,180,paid,Android,0,0.000000,0.00000,1,0,...,0,0,0,0,0.374937,2025-01,0.0,0.0,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196877,133672,2,173,paid,Android,0,0.000000,0.00000,9,2,...,0,0,0,0,0.223729,2025-01,0.0,0.0,False,1
196878,133673,14,49,paid,Android,32,0.042859,0.00000,161,90,...,0,0,0,0,0.359385,2024-10,0.0,0.0,False,0
196879,133674,1,208,paid,Android,0,0.000000,0.00000,0,0,...,0,0,0,0,0.000000,2025-01,0.0,0.0,False,8
196880,133674,2,208,paid,Android,2,0.137498,0.00000,5,1,...,0,0,0,0,0.151158,2024-07,0.0,0.0,False,2


In [198]:
# Train a CatBoost regressor for `ads_revenue_next_month`.

# Drop the identifier columns and all three next-month targets so that no label
# is left among the features.
X = df.drop(columns=[ "current_month", "id", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month"])
y = df["ads_revenue_next_month"]

features_names = X.columns.tolist()
# The later training cells reuse this list when they build their own pools.
cat_features_names = ['country', 'traffic_type', 'platform']

# NOTE(review): the split is row-wise and random, while one `id` can own several
# monthly rows (see the preview of `df`), so the same id may land in both train
# and validation. Consider a split grouped by `id` — confirm whether that matters here.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=features_names)
val_pool   = cb.Pool(X_val, y_val, cat_features=cat_features_names, feature_names=features_names)

model_ads_revenue_next_month = cb.CatBoostRegressor(
    iterations=2500,
    learning_rate=0.01,
    loss_function='RMSE',
    subsample=0.8,
    l2_leaf_reg=2,
)

# The validation pool drives early stopping: training halts once the eval
# metric has not improved for 100 consecutive iterations. `verbose=100` logs
# every 100th iteration and `plot=True` renders the interactive metric widget.
model_ads_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 3.2857453	test: 2.7102299	best: 2.7102299 (0)	total: 14.4ms	remaining: 35.9s
100:	learn: 2.5656721	test: 2.0744123	best: 2.0744123 (100)	total: 800ms	remaining: 19s
200:	learn: 2.3602082	test: 1.9676247	best: 1.9676247 (200)	total: 1.56s	remaining: 17.8s
300:	learn: 2.2887175	test: 1.9480814	best: 1.9480814 (300)	total: 2.31s	remaining: 16.9s
400:	learn: 2.2435903	test: 1.9436651	best: 1.9436106 (399)	total: 3.06s	remaining: 16s
500:	learn: 2.2054596	test: 1.9372738	best: 1.9372738 (500)	total: 3.81s	remaining: 15.2s
600:	learn: 2.1642036	test: 1.9356547	best: 1.9355705 (583)	total: 4.59s	remaining: 14.5s
700:	learn: 2.1263726	test: 1.9332049	best: 1.9332049 (700)	total: 5.35s	remaining: 13.7s
800:	learn: 2.0935854	test: 1.9317208	best: 1.9311959 (787)	total: 6.12s	remaining: 13s
900:	learn: 2.0560845	test: 1.9293092	best: 1.9290614 (899)	total: 6.95s	remaining: 12.3s
1000:	learn: 2.0193738	test: 1.9288882	best: 1.9287204 (950)	total: 7.75s	remaining: 11.6s
Stopped by overfit

<catboost.core.CatBoostRegressor at 0x7cad26a1e8d0>

In [199]:
# Pair each feature name with its importance score and rank the pairs from the
# most to the least important feature.
imp = sorted(
    zip(
        model_ads_revenue_next_month.feature_names_,
        model_ads_revenue_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('ads_revenue_current_month', 41.61492316550645),
 ('currency_1', 10.46702957825998),
 ('revenue_current_month', 9.466310713894678),
 ('currency_2', 7.707017808174541),
 ('current_avg_ping', 4.576335201900617),
 ('logins_current_month', 4.003357858231234),
 ('offers', 3.888565684851538),
 ('current_passed_level', 2.901046968091883),
 ('currency_5', 2.732645296844093),
 ('currency_3', 2.140507506640305),
 ('quests', 1.9744279977175205),
 ('ads_shown_current_month', 1.5304764805004993),
 ('hard_quests', 1.2268518244346205),
 ('currency_6', 1.1816942969338902),
 ('months_after_reg', 1.1212082462303619),
 ('wins_pvp', 0.7779832044229383),
 ('games_pvp', 0.6587969922768591),
 ('currency_4', 0.5901673709721192),
 ('currency_7', 0.5899446420204199),
 ('platform', 0.5630413477657614),
 ('traffic_type', 0.27660941270379596),
 ('country', 0.011058401625901163)]

In [200]:
# Train a CatBoost regressor for `revenue_next_month`.

# Drop the identifier columns and all three next-month targets so that no label
# is left among the features.
X = df.drop(columns=[ "current_month", "id", "is_active_next_month", "revenue_next_month", "ads_revenue_next_month"])
y = df["revenue_next_month"]


# Fixed seed so the train/validation split (and therefore the early-stopping
# point and the fitted model) is reproducible across runs, matching the other
# training cells. Shuffling is `train_test_split`'s default behaviour.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# `cat_features_names` is defined in the `ads_revenue_next_month` training cell.
train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=X.columns.tolist())
val_pool   = cb.Pool(X_val,   y_val,   cat_features=cat_features_names, feature_names=X.columns.tolist())

model_revenue_next_month = cb.CatBoostRegressor(
    iterations=5000,
    learning_rate=0.002,
    loss_function='RMSE',
    subsample=0.9,
    l2_leaf_reg=3,
)

# The validation pool drives early stopping: training halts once the eval
# metric has not improved for 100 consecutive iterations.
model_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 26.7344637	test: 26.5558730	best: 26.5558730 (0)	total: 10.1ms	remaining: 50.7s
100:	learn: 24.7528073	test: 24.9530673	best: 24.9530673 (100)	total: 789ms	remaining: 38.3s
200:	learn: 23.2052009	test: 23.8224557	best: 23.8224557 (200)	total: 1.6s	remaining: 38.2s
300:	learn: 21.9429182	test: 22.9587689	best: 22.9587689 (300)	total: 2.42s	remaining: 37.8s
400:	learn: 20.9255995	test: 22.3250136	best: 22.3250136 (400)	total: 3.25s	remaining: 37.3s
500:	learn: 20.1072831	test: 21.8612507	best: 21.8612507 (500)	total: 4.07s	remaining: 36.5s
600:	learn: 19.4371767	test: 21.5171403	best: 21.5171403 (600)	total: 4.87s	remaining: 35.7s
700:	learn: 18.8731726	test: 21.2644123	best: 21.2644123 (700)	total: 5.7s	remaining: 35s
800:	learn: 18.3893038	test: 21.0695295	best: 21.0695295 (800)	total: 6.51s	remaining: 34.1s
900:	learn: 17.9581471	test: 20.9144055	best: 20.9144055 (900)	total: 7.31s	remaining: 33.2s
1000:	learn: 17.5768912	test: 20.7881247	best: 20.7881247 (1000)	total: 8.09s

<catboost.core.CatBoostRegressor at 0x7cad2f24d970>

In [201]:
# Train a CatBoost classifier for `is_active_next_month`.

# Drop the identifier columns and all three next-month targets so that no label
# is left among the features.
X = df.drop(columns=["current_month", "id", "is_active_next_month", "ads_revenue_next_month", 'revenue_next_month'])
y = df["is_active_next_month"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# `cat_features_names` is defined in the `ads_revenue_next_month` training cell.
train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=X.columns.tolist())
val_pool   = cb.Pool(X_val,   y_val,   cat_features=cat_features_names, feature_names=X.columns.tolist())

model_is_active_next_month = cb.CatBoostClassifier(
    iterations=4000,
    learning_rate=0.025,
    loss_function='Logloss',
    subsample=0.9,
    l2_leaf_reg=3,
)

# The validation pool drives early stopping: training halts once the eval
# metric has not improved for 100 consecutive iterations.
model_is_active_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6756998	test: 0.6757156	best: 0.6757156 (0)	total: 14.8ms	remaining: 59.1s
100:	learn: 0.3997394	test: 0.4002172	best: 0.4002172 (100)	total: 1.47s	remaining: 56.8s
200:	learn: 0.3931507	test: 0.3947370	best: 0.3947370 (200)	total: 2.85s	remaining: 53.8s
300:	learn: 0.3903429	test: 0.3927936	best: 0.3927936 (300)	total: 4.16s	remaining: 51.1s
400:	learn: 0.3882572	test: 0.3915463	best: 0.3915463 (400)	total: 5.51s	remaining: 49.5s
500:	learn: 0.3865788	test: 0.3907882	best: 0.3907860 (499)	total: 7.02s	remaining: 49s
600:	learn: 0.3849638	test: 0.3902060	best: 0.3902023 (599)	total: 8.4s	remaining: 47.5s
700:	learn: 0.3835749	test: 0.3896914	best: 0.3896914 (700)	total: 9.72s	remaining: 45.7s
800:	learn: 0.3823576	test: 0.3893460	best: 0.3893460 (800)	total: 11s	remaining: 44.1s
900:	learn: 0.3812326	test: 0.3890955	best: 0.3890955 (900)	total: 12.3s	remaining: 42.3s
1000:	learn: 0.3801303	test: 0.3888358	best: 0.3888341 (998)	total: 13.6s	remaining: 40.8s
1100:	learn: 0.37

<catboost.core.CatBoostClassifier at 0x7cad6417c200>

In [202]:
# Sanity check: predict the activity label for one training row (index label 0),
# passing only the columns the classifier was fitted on.
model_is_active_next_month.predict(df[model_is_active_next_month.feature_names_].loc[0])

np.True_

# Predict

In [226]:
# Load the rows to score.
# NOTE(review): the preview below shows `id.1` and `current_month.1`, which suggests
# the CSV header repeats `id` and `current_month` — confirm the file is exported as intended.
prediction = pd.read_csv('predict.csv')

In [227]:
# Preview the frame to be scored.
prediction

Unnamed: 0,id,current_month,id.1,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,...,currency_4,currency_5,currency_6,currency_7,current_avg_ping,current_month.1,ads_revenue_next_month,revenue_next_month,is_active_next_month,months_after_reg
0,6,2025-04,6,29,133,organic,Android,66,0.040797,0.000000,...,0,0,0,0,0.277034,2025-04,0.0,0.0,False,3
1,10,2025-04,10,30,164,paid,iOS,781,26.512756,121.358247,...,85,220,97750,5350,0.049059,2025-04,0.0,0.0,False,1
2,11,2025-04,11,7,93,organic,Android,3,0.021389,0.000000,...,0,0,0,0,0.325849,2025-04,0.0,0.0,False,0
3,18,2025-04,18,9,112,organic,iOS,67,2.316753,5.058957,...,0,0,0,0,0.396381,2025-04,0.0,0.0,False,0
4,23,2025-04,23,13,67,paid,Android,19,0.145450,0.000000,...,0,0,0,0,0.209484,2025-04,0.0,0.0,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27245,133642,2025-04,133642,1,101,paid,Android,0,0.000000,0.000000,...,0,0,0,0,0.195539,2025-04,0.0,0.0,False,5
27246,133644,2025-04,133644,1,161,paid,Android,0,0.000000,0.000000,...,0,0,0,0,0.652246,2025-04,0.0,0.0,False,2
27247,133651,2025-04,133651,1,8,paid,Android,0,0.000000,0.000000,...,0,0,0,0,0.000000,2025-04,0.0,0.0,False,1
27248,133662,2025-04,133662,2,164,paid,Android,2,0.002376,0.000000,...,0,0,0,0,0.172545,2025-04,0.0,0.0,False,4


In [228]:
# Score every row of `prediction` with the three fitted models. Each model is
# given exactly the columns it was trained on, via its own `feature_names_`.
activity_inputs = prediction[model_is_active_next_month.feature_names_]
prediction['is_active'] = np.int8(model_is_active_next_month.predict(activity_inputs))

revenue_inputs = prediction[model_revenue_next_month.feature_names_]
prediction['revenue_next_month'] = model_revenue_next_month.predict(revenue_inputs)

ads_revenue_inputs = prediction[model_ads_revenue_next_month.feature_names_]
prediction['ads_revenue_next_month'] = model_ads_revenue_next_month.predict(ads_revenue_inputs)

# Keep an independent copy of just the id and the three predicted columns.
submission_columns = ['id', 'is_active', 'revenue_next_month', 'ads_revenue_next_month']
submission = prediction[submission_columns].copy()

submission

Unnamed: 0,id,is_active,revenue_next_month,ads_revenue_next_month
0,6,1,0.330876,0.034980
1,10,1,101.755163,23.520253
2,11,0,-0.009403,0.024897
3,18,0,0.982806,0.561370
4,23,0,0.055725,0.007664
...,...,...,...,...
27245,133642,0,0.047591,0.028054
27246,133644,0,-0.013337,0.011942
27247,133651,0,0.018559,0.009763
27248,133662,0,0.054540,0.027950


In [229]:
# The regressors can return slightly negative values, so floor each predicted
# component at zero before adding the two into one total.
revenue_floored = np.maximum(submission['revenue_next_month'], 0)
ads_revenue_floored = np.maximum(submission['ads_revenue_next_month'], 0)
submission['next_month_revenue'] = revenue_floored + ads_revenue_floored

# Only the id, the activity flag and the combined revenue are kept from here on.
submission = submission[['id', 'is_active', 'next_month_revenue']]


In [230]:
# Preview the per-id activity flag and combined revenue prediction.
submission

Unnamed: 0,id,is_active,next_month_revenue
0,6,1,0.365856
1,10,1,125.275416
2,11,0,0.024897
3,18,0,1.544176
4,23,0,0.063390
...,...,...,...
27245,133642,0,0.075645
27246,133644,0,0.011942
27247,133651,0,0.028323
27248,133662,0,0.082491


In [231]:
# Build the base frame for the final file: one row per distinct `id` found in
# history.csv (first occurrence kept), reduced to the `id` column and ordered
# by ascending `id`.
sub = (
    pd.read_csv('history.csv')
    .drop_duplicates(subset='id', keep='first')
    .loc[:, ['id']]
    .sort_values('id')
)

In [232]:
# Start every id from default values (inactive, zero revenue) and index the
# frame by `id`; `drop=False` keeps `id` available as a regular column too.
sub = (
    sub
    .assign(is_active=0, next_month_revenue=0.0)
    .set_index('id', drop=False)
)

In [233]:
# Overwrite the default values with the model outputs, aligned on `id`.
# `DataFrame.update` modifies `sub` in place and only touches ids that already
# exist in `sub`; ids without a prediction keep their defaults.
sub.update(submission.set_index("id"))

In [234]:
# Fix the column order used for the output file, then preview the result.
sub = sub[ ['id', 'next_month_revenue', 'is_active'] ]

sub

Unnamed: 0_level_0,id,next_month_revenue,is_active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0.0,0
2,2,0.0,0
3,3,0.0,0
4,4,0.0,0
5,5,0.0,0
...,...,...,...
133670,133670,0.0,0
133671,133671,0.0,0
133672,133672,0.0,0
133673,133673,0.0,0


In [235]:
# Write the final file. `index=False` skips the `id` index, which is fine because
# `id` is also kept as a regular column.
sub.to_csv('submission.csv', index=False)