In [46]:
import pandas as pd
import numpy as np
import catboost as cb

In [47]:
def get_next_month(date: str) -> str:
    """Return the month after ``date`` formatted as ``YYYY-MM``.

    ``date`` is parsed by ``pd.Period`` at monthly frequency, so any string
    pandas can resolve to a month is accepted.
    """
    current = pd.Period(date, freq="M")
    following = current + 1
    return str(following)



## Clean up the data

In [48]:
# Load the training table; the path is relative to the notebook's working directory.
df = pd.read_csv('task_g/training.csv')

In [49]:
# Show the loaded training frame as the cell output.
df

Unnamed: 0,id,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,games_pvp,wins_pvp,...,current_avg_ping,current_month,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,log_revenue_current_month,log_ads_revenue_current_month
0,1,13,81,paid,iOS,100,21.373946,23.33529,265,205,...,0.275935,2024-07,0.0,0.0,0.0,0.0,False,1,3.191928,3.107897
1,1,1,81,paid,iOS,1,0.867439,8.87251,18,7,...,0.263144,2024-09,0.0,0.0,0.0,0.0,False,3,2.289754,0.624568
2,2,1,193,paid,Android,0,0.000000,0.00000,0,0,...,0.000000,2024-10,0.0,0.0,0.0,0.0,False,4,0.000000,0.000000
3,3,7,180,paid,Android,0,0.000000,0.00000,26,20,...,0.430013,2024-12,0.0,0.0,0.0,0.0,True,0,0.000000,0.000000
4,3,2,180,paid,Android,0,0.000000,0.00000,1,0,...,0.374937,2025-01,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196877,133672,2,173,paid,Android,0,0.000000,0.00000,9,2,...,0.223729,2025-01,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
196878,133673,14,49,paid,Android,32,0.042859,0.00000,161,90,...,0.359385,2024-10,0.0,0.0,0.0,0.0,False,0,0.000000,0.041966
196879,133674,1,208,paid,Android,0,0.000000,0.00000,0,0,...,0.000000,2025-01,0.0,0.0,0.0,0.0,False,8,0.000000,0.000000
196880,133674,2,208,paid,Android,2,0.137498,0.00000,5,1,...,0.151158,2024-07,0.0,0.0,0.0,0.0,False,2,0.000000,0.128831


In [50]:
# Model 1: regress the log-scale target `log_ads_revenue_next_month`.

from sklearn.model_selection import train_test_split

# The identifier, the period and every next-month (label) column are kept out of the features.
non_feature_columns = [
    "id",
    "current_month",
    "is_active_next_month",
    "ads_revenue_next_month",
    "revenue_next_month",
    "log_ads_revenue_next_month",
    "log_revenue_next_month",
]
X = df.drop(columns=non_feature_columns)
y = df["log_ads_revenue_next_month"]

features_names = list(X.columns)
# Also read by the later training cells, so it has to stay a notebook-level name.
cat_features_names = ['country', 'traffic_type', 'platform']

# Fixed-seed 80/20 random split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

pool_options = {"cat_features": cat_features_names, "feature_names": features_names}
train_pool = cb.Pool(X_train, y_train, **pool_options)
val_pool = cb.Pool(X_val, y_val, **pool_options)

ads_revenue_params = {
    "iterations": 2500,
    "learning_rate": 0.01,
    "loss_function": 'RMSE',
    "subsample": 0.8,
    "l2_leaf_reg": 2,
}
model_ads_revenue_next_month = cb.CatBoostRegressor(**ads_revenue_params)

# The validation pool drives early stopping (patience of 100 rounds); progress is logged every 100 iterations.
model_ads_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4513387	test: 0.4479678	best: 0.4479678 (0)	total: 8.75ms	remaining: 21.9s
100:	learn: 0.3101217	test: 0.3087410	best: 0.3087410 (100)	total: 860ms	remaining: 20.4s
200:	learn: 0.2794441	test: 0.2794698	best: 0.2794698 (200)	total: 1.73s	remaining: 19.8s
300:	learn: 0.2718542	test: 0.2727575	best: 0.2727575 (300)	total: 2.59s	remaining: 18.9s
400:	learn: 0.2688656	test: 0.2706029	best: 0.2706029 (400)	total: 3.46s	remaining: 18.1s
500:	learn: 0.2672605	test: 0.2696026	best: 0.2696026 (500)	total: 4.29s	remaining: 17.1s
600:	learn: 0.2659363	test: 0.2690028	best: 0.2690028 (600)	total: 5.11s	remaining: 16.1s
700:	learn: 0.2648026	test: 0.2685014	best: 0.2685014 (700)	total: 5.94s	remaining: 15.2s
800:	learn: 0.2637545	test: 0.2680939	best: 0.2680939 (800)	total: 6.79s	remaining: 14.4s
900:	learn: 0.2627781	test: 0.2677130	best: 0.2677130 (900)	total: 7.6s	remaining: 13.5s
1000:	learn: 0.2618477	test: 0.2674444	best: 0.2674444 (1000)	total: 8.42s	remaining: 12.6s
1100:	learn:

<catboost.core.CatBoostRegressor at 0x70cff6a54140>

In [51]:
# Rank the ads-revenue model's features by importance, highest first.
imp = sorted(
    zip(
        model_ads_revenue_next_month.feature_names_,
        model_ads_revenue_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('currency_1', 21.227223603636183),
 ('ads_revenue_current_month', 18.963764351322997),
 ('log_ads_revenue_current_month', 17.655561261082152),
 ('currency_2', 13.340828739215631),
 ('current_passed_level', 6.313261601819904),
 ('logins_current_month', 5.017475922035451),
 ('months_after_reg', 2.0616897032596495),
 ('revenue_current_month', 1.759633695228549),
 ('log_revenue_current_month', 1.6176948186853328),
 ('quests', 1.6116801746912823),
 ('offers', 1.5911281999279936),
 ('ads_shown_current_month', 1.5292010158495577),
 ('current_avg_ping', 1.277188709683949),
 ('games_pvp', 1.0291753924981557),
 ('country', 0.8577088200163947),
 ('currency_6', 0.6861401010725715),
 ('wins_pvp', 0.6714812153848704),
 ('hard_quests', 0.6159135601984752),
 ('currency_3', 0.5457192764361145),
 ('currency_4', 0.5423304838023599),
 ('currency_7', 0.4372793062735931),
 ('platform', 0.28399938622332893),
 ('currency_5', 0.2808902054789996),
 ('traffic_type', 0.08303045617646486)]

In [52]:
# Model 2: regress the log-scale target `log_revenue_next_month`.
# Relies on `df` and `cat_features_names` defined by earlier cells.

from sklearn.model_selection import train_test_split

# The identifier, the period and every next-month (label) column are kept out of the features.
X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["log_revenue_next_month"]

# Seed the split so the validation set, and therefore the early-stopping point and the
# fitted model, are identical on every run. 42 matches the other two training cells.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=X.columns.tolist())
val_pool   = cb.Pool(X_val,   y_val,   cat_features=cat_features_names, feature_names=X.columns.tolist())

model_revenue_next_month = cb.CatBoostRegressor(
    iterations=5000,
    learning_rate=0.002,
    loss_function='RMSE',
    subsample=0.9,
    l2_leaf_reg=3,
)

# The validation pool drives early stopping (patience of 100 rounds); progress is logged every 100 iterations.
model_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6123310	test: 0.6177134	best: 0.6177134 (0)	total: 9.63ms	remaining: 48.1s
100:	learn: 0.5586142	test: 0.5650401	best: 0.5650401 (100)	total: 919ms	remaining: 44.6s
200:	learn: 0.5188976	test: 0.5261983	best: 0.5261983 (200)	total: 1.84s	remaining: 44s
300:	learn: 0.4899720	test: 0.4980401	best: 0.4980401 (300)	total: 2.77s	remaining: 43.3s
400:	learn: 0.4690428	test: 0.4777503	best: 0.4777503 (400)	total: 3.7s	remaining: 42.4s
500:	learn: 0.4540532	test: 0.4632667	best: 0.4632667 (500)	total: 4.58s	remaining: 41.1s
600:	learn: 0.4432672	test: 0.4529242	best: 0.4529242 (600)	total: 5.51s	remaining: 40.3s
700:	learn: 0.4355373	test: 0.4455755	best: 0.4455755 (700)	total: 6.42s	remaining: 39.4s
800:	learn: 0.4299792	test: 0.4404005	best: 0.4404005 (800)	total: 7.37s	remaining: 38.6s
900:	learn: 0.4258934	test: 0.4366734	best: 0.4366734 (900)	total: 8.3s	remaining: 37.7s
1000:	learn: 0.4228591	test: 0.4339777	best: 0.4339777 (1000)	total: 9.24s	remaining: 36.9s
1100:	learn: 0.

<catboost.core.CatBoostRegressor at 0x70cff6a8a210>

In [53]:
# Rank the revenue model's features by importance, highest first.
imp = sorted(
    zip(
        model_revenue_next_month.feature_names_,
        model_revenue_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('revenue_current_month', 23.423133800936238),
 ('log_revenue_current_month', 22.957218505447653),
 ('currency_1', 15.600963355986256),
 ('currency_2', 11.672951109776983),
 ('current_passed_level', 4.994210007344903),
 ('offers', 3.516851644277611),
 ('log_ads_revenue_current_month', 2.1990133090200223),
 ('logins_current_month', 2.1425558342765263),
 ('ads_revenue_current_month', 2.028695048235034),
 ('country', 1.5743399520879264),
 ('ads_shown_current_month', 1.3352387554705678),
 ('games_pvp', 1.2539444778143085),
 ('months_after_reg', 1.2434885813312024),
 ('quests', 1.147787472217301),
 ('current_avg_ping', 0.9473564259008688),
 ('currency_6', 0.7358513815555296),
 ('currency_4', 0.6937675655943437),
 ('hard_quests', 0.5668169118718278),
 ('wins_pvp', 0.5449173194531699),
 ('currency_3', 0.4623157404976772),
 ('currency_7', 0.3790246089992522),
 ('currency_5', 0.33092832144624745),
 ('platform', 0.16876915664233982),
 ('traffic_type', 0.07986071381621065)]

In [54]:
# Model 3: classify the flag `is_active_next_month`.
# Relies on `df` and `cat_features_names` defined by earlier cells.

from sklearn.model_selection import train_test_split

# The identifier, the period and every next-month (label) column are kept out of the features.
non_feature_columns = [
    "id",
    "current_month",
    "is_active_next_month",
    "ads_revenue_next_month",
    "revenue_next_month",
    "log_ads_revenue_next_month",
    "log_revenue_next_month",
]
X = df.drop(columns=non_feature_columns)
y = df["is_active_next_month"]

# Fixed-seed 80/20 random split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_pool = cb.Pool(
    X_train,
    y_train,
    cat_features=cat_features_names,
    feature_names=X.columns.tolist(),
)
val_pool = cb.Pool(
    X_val,
    y_val,
    cat_features=cat_features_names,
    feature_names=X.columns.tolist(),
)

is_active_params = {
    "iterations": 4000,
    "learning_rate": 0.025,
    "loss_function": 'Logloss',
    "subsample": 0.9,
    "l2_leaf_reg": 3,
}
model_is_active_next_month = cb.CatBoostClassifier(**is_active_params)

# The validation pool drives early stopping (patience of 100 rounds); progress is logged every 100 iterations.
model_is_active_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6755923	test: 0.6756663	best: 0.6756663 (0)	total: 17.8ms	remaining: 1m 11s
100:	learn: 0.3998326	test: 0.4003609	best: 0.4003609 (100)	total: 1.36s	remaining: 52.5s
200:	learn: 0.3932484	test: 0.3949722	best: 0.3949722 (200)	total: 2.69s	remaining: 50.9s
300:	learn: 0.3904300	test: 0.3931444	best: 0.3931444 (300)	total: 4s	remaining: 49.2s
400:	learn: 0.3884895	test: 0.3919740	best: 0.3919740 (400)	total: 5.31s	remaining: 47.6s
500:	learn: 0.3867476	test: 0.3911122	best: 0.3911122 (500)	total: 6.66s	remaining: 46.5s
600:	learn: 0.3850490	test: 0.3903357	best: 0.3903357 (600)	total: 7.98s	remaining: 45.1s
700:	learn: 0.3837120	test: 0.3898689	best: 0.3898688 (699)	total: 9.27s	remaining: 43.6s
800:	learn: 0.3825269	test: 0.3895229	best: 0.3895229 (800)	total: 10.6s	remaining: 42.2s
900:	learn: 0.3813992	test: 0.3892151	best: 0.3892144 (898)	total: 11.9s	remaining: 40.8s
1000:	learn: 0.3802896	test: 0.3889333	best: 0.3889317 (997)	total: 13.2s	remaining: 39.5s
1100:	learn: 0

<catboost.core.CatBoostClassifier at 0x70cff652b4a0>

In [55]:
# Rank the activity classifier's features by importance, highest first.
imp = sorted(
    zip(
        model_is_active_next_month.feature_names_,
        model_is_active_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('currency_1', 25.255137842908372),
 ('logins_current_month', 15.537154036110854),
 ('currency_2', 12.519515119850682),
 ('current_passed_level', 8.543686811222097),
 ('months_after_reg', 6.5203479229986305),
 ('current_avg_ping', 5.304040717827397),
 ('quests', 4.131150103428365),
 ('games_pvp', 4.0547650418465535),
 ('wins_pvp', 2.574952251974163),
 ('ads_shown_current_month', 2.464327349133731),
 ('country', 2.3433717671692764),
 ('platform', 1.8506973295287552),
 ('ads_revenue_current_month', 1.5341258245248295),
 ('offers', 1.4586243711229447),
 ('log_ads_revenue_current_month', 1.2409526397736843),
 ('hard_quests', 1.176816410924323),
 ('currency_3', 0.5537921470186354),
 ('log_revenue_current_month', 0.5380079450542609),
 ('currency_7', 0.5318299695450306),
 ('traffic_type', 0.5100468403189307),
 ('revenue_current_month', 0.48055751543991987),
 ('currency_5', 0.38000236240359125),
 ('currency_6', 0.3140192928006526),
 ('currency_4', 0.18207838707425975)]

# Predict

In [56]:
# Load the rows to score; each model later selects its own feature columns from this frame by name.
prediction = pd.read_csv('task_g/predict.csv')

In [57]:
# Show the frame to be scored as the cell output.
prediction

Unnamed: 0,id,current_month,id.1,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,...,current_avg_ping,current_month.1,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,log_revenue_current_month,log_ads_revenue_current_month
0,6,2025-04,6,29,133,organic,Android,66,0.040797,0.000000,...,0.277034,2025-04,0.0,0.0,0.0,0.0,False,3,0.000000,0.039987
1,10,2025-04,10,30,164,paid,iOS,781,26.512756,121.358247,...,0.049059,2025-04,0.0,0.0,0.0,0.0,False,1,4.806953,3.314650
2,11,2025-04,11,7,93,organic,Android,3,0.021389,0.000000,...,0.325849,2025-04,0.0,0.0,0.0,0.0,False,0,0.000000,0.021164
3,18,2025-04,18,9,112,organic,iOS,67,2.316753,5.058957,...,0.396381,2025-04,0.0,0.0,0.0,0.0,False,0,1.801538,1.198986
4,23,2025-04,23,13,67,paid,Android,19,0.145450,0.000000,...,0.209484,2025-04,0.0,0.0,0.0,0.0,False,1,0.000000,0.135797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27245,133642,2025-04,133642,1,101,paid,Android,0,0.000000,0.000000,...,0.195539,2025-04,0.0,0.0,0.0,0.0,False,5,0.000000,0.000000
27246,133644,2025-04,133644,1,161,paid,Android,0,0.000000,0.000000,...,0.652246,2025-04,0.0,0.0,0.0,0.0,False,2,0.000000,0.000000
27247,133651,2025-04,133651,1,8,paid,Android,0,0.000000,0.000000,...,0.000000,2025-04,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
27248,133662,2025-04,133662,2,164,paid,Android,2,0.002376,0.000000,...,0.172545,2025-04,0.0,0.0,0.0,0.0,False,4,0.000000,0.002374


In [58]:
# Score the prediction frame with each model. Every model picks the columns it was
# trained on by name, and its output is written back onto `prediction`.
active_inputs = prediction[model_is_active_next_month.feature_names_]
prediction['is_active'] = np.int8(model_is_active_next_month.predict(active_inputs))

# Both regressors were fit on log_* targets; expm1 is applied to their outputs
# (presumably inverting a log1p transform — confirm against how the log_* columns were built).
revenue_inputs = prediction[model_revenue_next_month.feature_names_]
prediction['revenue_next_month'] = np.expm1(model_revenue_next_month.predict(revenue_inputs))

ads_inputs = prediction[model_ads_revenue_next_month.feature_names_]
prediction['ads_revenue_next_month'] = np.expm1(model_ads_revenue_next_month.predict(ads_inputs))

# Independent copy holding only the id and the three predicted columns.
submission_columns = ['id', 'is_active', 'revenue_next_month', 'ads_revenue_next_month']
submission = prediction[submission_columns].copy()

submission

Unnamed: 0,id,is_active,revenue_next_month,ads_revenue_next_month
0,6,1,0.007106,0.053087
1,10,1,46.109331,14.856674
2,11,0,0.005815,0.011714
3,18,0,0.277149,0.165263
4,23,0,0.006422,0.006881
...,...,...,...,...
27245,133642,0,0.010229,0.009384
27246,133644,0,0.001153,0.000251
27247,133651,0,0.001384,0.000863
27248,133662,0,0.010458,0.015232


In [59]:
# Total next-month revenue = predicted revenue + predicted ads revenue, each floored at 0
# (expm1 of a negative log-scale prediction is negative).
#
# The frame is rebuilt from `prediction`, which carries the per-model outputs, instead of
# mutating `submission` and then dropping the columns just read. The cell can therefore be
# re-run without a KeyError, and it yields the same id / is_active / next_month_revenue table.
submission = prediction[['id', 'is_active']].copy()
submission['next_month_revenue'] = (
    np.maximum(prediction['revenue_next_month'], 0)
    + np.maximum(prediction['ads_revenue_next_month'], 0)
)


In [60]:
# Show the per-id predictions (id, is_active, next_month_revenue) as the cell output.
submission

Unnamed: 0,id,is_active,next_month_revenue
0,6,1,0.060194
1,10,1,60.966005
2,11,0,0.017530
3,18,0,0.442411
4,23,0,0.013303
...,...,...,...
27245,133642,0,0.019614
27246,133644,0,0.001404
27247,133651,0,0.002246
27248,133662,0,0.025690


In [61]:
# Build the submission skeleton: one row per distinct id found in the history file,
# ordered by id.
# - Only the `id` column is parsed, since nothing else from the file is used.
# - The steps are chained without `inplace=True`: sorting in place a frame that was
#   obtained by column selection can trigger pandas' SettingWithCopyWarning.
sub = (
    pd.read_csv('task_g/history.csv', usecols=['id'])
    .drop_duplicates(subset='id', keep='first')
    .sort_values('id')
)

In [62]:
# Give every id a default of "inactive, zero revenue", then index the frame by id
# while also keeping `id` as a regular column (drop=False).
sub = (
    sub
    .assign(is_active=0, next_month_revenue=0.0)
    .set_index('id', drop=False)
)

In [63]:
# Overlay the model outputs onto the default rows, matching on the `id` index.
# Ids that are not present in `submission` keep is_active=0 / next_month_revenue=0.0.
sub.update(submission.set_index("id"))

# `DataFrame.update` aligns the incoming frame to `sub` and fills the gaps with NaN,
# so depending on the pandas version the integer flag may come back as float (0.0/1.0).
# Pin it to an integer dtype so the exported file always holds 0/1.
sub["is_active"] = sub["is_active"].astype("int64")

In [64]:
# Keep only the output columns, in the order id, next_month_revenue, is_active.
output_columns = ['id', 'next_month_revenue', 'is_active']
sub = sub.loc[:, output_columns]

sub

Unnamed: 0_level_0,id,next_month_revenue,is_active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0.0,0
2,2,0.0,0
3,3,0.0,0
4,4,0.0,0
5,5,0.0,0
...,...,...,...
133670,133670,0.0,0
133671,133671,0.0,0
133672,133672,0.0,0
133673,133673,0.0,0


In [65]:
# Write the final table without the index; `id` is still exported because it is also a regular column.
sub.to_csv('task_g/submission.csv', index=False)