In [1]:
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def get_next_month(date: str) -> str:
    """Return the calendar month that follows ``date``.

    Args:
        date: Anything ``pd.Period`` can parse at monthly frequency,
            e.g. ``"2024-12"``.

    Returns:
        The following month rendered by ``str(pd.Period)``, e.g. ``"2025-01"``.
    """
    current_period = pd.Period(date, freq="M")
    following_period = current_period + 1
    return str(following_period)



## Clean up the data

In [3]:
# Load the training table; every model below is fitted on this frame.
df = pd.read_csv('training.csv')

In [4]:
# Preview the training frame (the notebook renders only the head and tail rows).
df

Unnamed: 0,id,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,games_pvp,wins_pvp,...,current_avg_ping,current_month,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,log_revenue_current_month,log_ads_revenue_current_month
0,1,13,81,paid,iOS,100,21.373946,23.33529,265,205,...,0.275935,2024-07,0.0,0.0,0.0,0.0,False,1,3.191928,3.107897
1,1,1,81,paid,iOS,1,0.867439,8.87251,18,7,...,0.263144,2024-09,0.0,0.0,0.0,0.0,False,3,2.289754,0.624568
2,2,1,193,paid,Android,0,0.000000,0.00000,0,0,...,0.000000,2024-10,0.0,0.0,0.0,0.0,False,4,0.000000,0.000000
3,3,7,180,paid,Android,0,0.000000,0.00000,26,20,...,0.430013,2024-12,0.0,0.0,0.0,0.0,True,0,0.000000,0.000000
4,3,2,180,paid,Android,0,0.000000,0.00000,1,0,...,0.374937,2025-01,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190247,133672,2,173,paid,Android,0,0.000000,0.00000,9,2,...,0.223729,2025-01,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
190248,133673,14,49,paid,Android,32,0.042859,0.00000,161,90,...,0.359385,2024-10,0.0,0.0,0.0,0.0,False,0,0.000000,0.041966
190249,133674,1,208,paid,Android,0,0.000000,0.00000,0,0,...,0.000000,2025-01,0.0,0.0,0.0,0.0,False,8,0.000000,0.000000
190250,133674,2,208,paid,Android,2,0.137498,0.00000,5,1,...,0.151158,2024-07,0.0,0.0,0.0,0.0,False,2,0.000000,0.128831


In [5]:
# Train the regressor for `ads_revenue_next_month`.

# Features: everything except the row identifier, the period label and all
# next-month targets (raw and log-transformed), so no label leaks into X.
X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["ads_revenue_next_month"]

# These two names are reused by later cells, so keep them at notebook scope.
features_names = X.columns.tolist()
cat_features_names = ['country', 'traffic_type', 'platform']

# NOTE(review): this is a random row-level split, but one `id` can occur in
# several months (see the frame preview), so the same user may land in both
# train and validation. Consider a split grouped by `id` - TODO confirm.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=features_names)
val_pool   = cb.Pool(X_val, y_val, cat_features=cat_features_names, feature_names=features_names)

model_ads_revenue_next_month = cb.CatBoostRegressor(
    iterations=2500,
    learning_rate=0.01,
    loss_function='RMSE',
    subsample=0.8,
    l2_leaf_reg=2,
)

# The validation pool drives early stopping: training halts once the eval
# metric has not improved for 100 consecutive iterations.
model_ads_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4137110	test: 0.4111640	best: 0.4111640 (0)	total: 68.4ms	remaining: 2m 50s
100:	learn: 0.3217083	test: 0.3216903	best: 0.3216903 (100)	total: 1.02s	remaining: 24.3s
200:	learn: 0.3035067	test: 0.3046061	best: 0.3046061 (200)	total: 1.93s	remaining: 22.1s
300:	learn: 0.2990358	test: 0.3007689	best: 0.3007689 (300)	total: 2.92s	remaining: 21.4s
400:	learn: 0.2971982	test: 0.2994828	best: 0.2994828 (400)	total: 3.83s	remaining: 20.1s
500:	learn: 0.2959378	test: 0.2988787	best: 0.2988784 (498)	total: 4.77s	remaining: 19s
600:	learn: 0.2948601	test: 0.2984758	best: 0.2984758 (600)	total: 5.67s	remaining: 17.9s
700:	learn: 0.2939116	test: 0.2981677	best: 0.2981677 (700)	total: 6.52s	remaining: 16.7s
800:	learn: 0.2930746	test: 0.2978942	best: 0.2978942 (800)	total: 7.37s	remaining: 15.6s
900:	learn: 0.2923375	test: 0.2976879	best: 0.2976879 (900)	total: 8.24s	remaining: 14.6s
1000:	learn: 0.2916062	test: 0.2975555	best: 0.2975507 (999)	total: 9.14s	remaining: 13.7s
1100:	learn: 

<catboost.core.CatBoostRegressor at 0x70cff6ecf710>

In [6]:
# Rank the ads-revenue model's features by importance, highest first.
# Each entry is a (feature_name, importance) tuple.
imp = sorted(
    zip(
        model_ads_revenue_next_month.feature_names_,
        model_ads_revenue_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('currency_1', 18.95417373854969),
 ('currency_2', 17.09363295904768),
 ('ads_revenue_current_month', 14.331712729064492),
 ('log_ads_revenue_current_month', 13.884755736231968),
 ('current_passed_level', 7.563139009291327),
 ('logins_current_month', 5.625870985537984),
 ('ads_shown_current_month', 3.5014647872067637),
 ('months_after_reg', 3.184524514570398),
 ('quests', 1.9369936305781097),
 ('current_avg_ping', 1.4384799626406106),
 ('games_pvp', 1.3394379742972624),
 ('offers', 1.2324919440577253),
 ('currency_3', 1.2054872014310627),
 ('log_revenue_current_month', 1.1886664698455798),
 ('revenue_current_month', 1.170553189047631),
 ('wins_pvp', 1.1188730664251543),
 ('currency_6', 1.1146907447432062),
 ('currency_7', 1.0896043688389834),
 ('hard_quests', 0.8762672827160383),
 ('currency_4', 0.8326319305780122),
 ('country', 0.5863500842055099),
 ('currency_5', 0.4592927502287674),
 ('platform', 0.14820675785281281),
 ('traffic_type', 0.12269818301323089)]

In [7]:
# Train the regressor for `revenue_next_month`.

# Same feature set as the other models: drop the identifier, the period label
# and every next-month target (raw and log-transformed).
X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["revenue_next_month"]


# Fixed seed so the split (and therefore the reported metrics and the
# early-stopping point) is reproducible and matches the sibling training cells.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# `cat_features_names` is defined in the ads-revenue training cell.
train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=X.columns.tolist())
val_pool   = cb.Pool(X_val,   y_val,   cat_features=cat_features_names, feature_names=X.columns.tolist())

model_revenue_next_month = cb.CatBoostRegressor(
    iterations=5000,
    learning_rate=0.002,
    loss_function='RMSE',
    subsample=0.9,
    l2_leaf_reg=3,
)

# The validation pool drives early stopping: training halts once the eval
# metric has not improved for 100 consecutive iterations.
model_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 6.2195905	test: 5.3685687	best: 5.3685687 (0)	total: 10.9ms	remaining: 54.7s
100:	learn: 5.9551454	test: 5.2193508	best: 5.2193508 (100)	total: 910ms	remaining: 44.1s
200:	learn: 5.7424218	test: 5.1131398	best: 5.1131398 (200)	total: 1.84s	remaining: 44s
300:	learn: 5.5695100	test: 5.0310713	best: 5.0310713 (300)	total: 2.77s	remaining: 43.3s
400:	learn: 5.4265468	test: 4.9657267	best: 4.9657267 (400)	total: 3.71s	remaining: 42.5s
500:	learn: 5.3080959	test: 4.9242666	best: 4.9242666 (500)	total: 4.68s	remaining: 42s
600:	learn: 5.2092113	test: 4.8968937	best: 4.8968937 (600)	total: 5.6s	remaining: 41s
700:	learn: 5.1255958	test: 4.8793863	best: 4.8793863 (700)	total: 6.55s	remaining: 40.2s
800:	learn: 5.0523795	test: 4.8677781	best: 4.8677289 (799)	total: 7.47s	remaining: 39.2s
900:	learn: 4.9857709	test: 4.8612145	best: 4.8611626 (899)	total: 8.43s	remaining: 38.4s
1000:	learn: 4.9302845	test: 4.8598101	best: 4.8598101 (1000)	total: 9.36s	remaining: 37.4s
1100:	learn: 4.880

<catboost.core.CatBoostRegressor at 0x70cff6a57fe0>

In [8]:
# Train the classifier for `is_active_next_month`.

# Same feature set as the regressors: drop the identifier, the period label
# and every next-month target (raw and log-transformed).
X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["is_active_next_month"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# `cat_features_names` is defined in the ads-revenue training cell.
train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=X.columns.tolist())
val_pool   = cb.Pool(X_val,   y_val,   cat_features=cat_features_names, feature_names=X.columns.tolist())

model_is_active_next_month = cb.CatBoostClassifier(
    iterations=4000,
    learning_rate=0.025,
    loss_function='Logloss',
    subsample=0.9,
    l2_leaf_reg=3,
)

# The validation pool drives early stopping: training halts once the eval
# metric has not improved for 100 consecutive iterations.
model_is_active_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6758415	test: 0.6758097	best: 0.6758097 (0)	total: 14.3ms	remaining: 57.1s
100:	learn: 0.4069993	test: 0.4066582	best: 0.4066582 (100)	total: 1.38s	remaining: 53.3s
200:	learn: 0.4005184	test: 0.4006778	best: 0.4006778 (200)	total: 2.67s	remaining: 50.5s
300:	learn: 0.3975600	test: 0.3983928	best: 0.3983928 (300)	total: 4.02s	remaining: 49.4s
400:	learn: 0.3955390	test: 0.3970939	best: 0.3970939 (400)	total: 5.36s	remaining: 48.1s
500:	learn: 0.3937247	test: 0.3960965	best: 0.3960965 (500)	total: 6.85s	remaining: 47.8s
600:	learn: 0.3918897	test: 0.3952487	best: 0.3952487 (600)	total: 8.32s	remaining: 47.1s
700:	learn: 0.3903908	test: 0.3946917	best: 0.3946883 (699)	total: 9.7s	remaining: 45.6s
800:	learn: 0.3889801	test: 0.3943293	best: 0.3943293 (800)	total: 11.1s	remaining: 44.3s
900:	learn: 0.3877402	test: 0.3939784	best: 0.3939784 (900)	total: 12.5s	remaining: 42.9s
1000:	learn: 0.3865705	test: 0.3937788	best: 0.3937766 (999)	total: 13.8s	remaining: 41.4s
1100:	learn: 

<catboost.core.CatBoostClassifier at 0x70d03979c2f0>

In [9]:
# Smoke test: classify the training row labelled 0, restricted to the
# feature columns the activity model was fitted on.
model_is_active_next_month.predict(
    df.loc[0, model_is_active_next_month.feature_names_]
)

np.True_

# Predict

In [10]:
# Load the rows that need next-month predictions.
prediction = pd.read_csv('predict.csv')

In [11]:
# Preview the scoring frame before any model output is written into it.
prediction

Unnamed: 0,id,current_month,id.1,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,...,current_avg_ping,current_month.1,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,log_revenue_current_month,log_ads_revenue_current_month
0,6,2025-04,6,29,133,organic,Android,66,0.040797,0.000000,...,0.277034,2025-04,0.0,0.0,0.0,0.0,False,3,0.000000,0.039987
1,10,2025-04,10,30,164,paid,iOS,781,26.512756,121.358247,...,0.049059,2025-04,0.0,0.0,0.0,0.0,False,1,4.806953,3.314650
2,11,2025-04,11,7,93,organic,Android,3,0.021389,0.000000,...,0.325849,2025-04,0.0,0.0,0.0,0.0,False,0,0.000000,0.021164
3,18,2025-04,18,9,112,organic,iOS,67,2.316753,5.058957,...,0.396381,2025-04,0.0,0.0,0.0,0.0,False,0,1.801538,1.198986
4,23,2025-04,23,13,67,paid,Android,19,0.145450,0.000000,...,0.209484,2025-04,0.0,0.0,0.0,0.0,False,1,0.000000,0.135797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27245,133642,2025-04,133642,1,101,paid,Android,0,0.000000,0.000000,...,0.195539,2025-04,0.0,0.0,0.0,0.0,False,5,0.000000,0.000000
27246,133644,2025-04,133644,1,161,paid,Android,0,0.000000,0.000000,...,0.652246,2025-04,0.0,0.0,0.0,0.0,False,2,0.000000,0.000000
27247,133651,2025-04,133651,1,8,paid,Android,0,0.000000,0.000000,...,0.000000,2025-04,0.0,0.0,0.0,0.0,False,1,0.000000,0.000000
27248,133662,2025-04,133662,2,164,paid,Android,2,0.002376,0.000000,...,0.172545,2025-04,0.0,0.0,0.0,0.0,False,4,0.000000,0.002374


In [12]:
# Score the prediction frame. Every model receives exactly the feature
# columns it was fitted on (`feature_names_`).
prediction['is_active'] = np.int8(
    model_is_active_next_month.predict(
        prediction[model_is_active_next_month.feature_names_]
    )
)

# Both regressors write into the target column of the same name.
for target_column, regressor in (
    ('revenue_next_month', model_revenue_next_month),
    ('ads_revenue_next_month', model_ads_revenue_next_month),
):
    prediction[target_column] = regressor.predict(prediction[regressor.feature_names_])

# Independent copy holding only the id and the three model outputs.
submission = prediction.loc[
    :, ['id', 'is_active', 'revenue_next_month', 'ads_revenue_next_month']
].copy()

submission

Unnamed: 0,id,is_active,revenue_next_month,ads_revenue_next_month
0,6,1,0.275301,0.061475
1,10,1,16.970745,1.630964
2,11,0,0.097595,0.011354
3,18,0,0.551801,0.136804
4,23,0,0.204802,0.026818
...,...,...,...,...
27245,133642,0,0.106241,0.011292
27246,133644,0,0.081327,0.005451
27247,133651,0,0.089113,0.003097
27248,133662,0,0.105664,0.009405


In [13]:
# Combine both revenue predictions into one total per row. Negative model
# outputs are floored at zero before summing, since revenue cannot be negative.
#
# The frame is rebuilt from `prediction` rather than by mutating and then
# overwriting `submission`, so this cell can be re-run without a KeyError on
# the already-dropped revenue columns.
submission = prediction[['id', 'is_active']].assign(
    next_month_revenue=(
        prediction['revenue_next_month'].clip(lower=0)
        + prediction['ads_revenue_next_month'].clip(lower=0)
    )
)


In [14]:
# Inspect the per-id predictions: id, is_active, next_month_revenue.
submission

Unnamed: 0,id,is_active,next_month_revenue
0,6,1,0.336776
1,10,1,18.601709
2,11,0,0.108950
3,18,0,0.688605
4,23,0,0.231620
...,...,...,...
27245,133642,0,0.117533
27246,133644,0,0.086778
27247,133651,0,0.092210
27248,133662,0,0.115069


In [15]:
# Build the list of ids to submit: one row per distinct `id` found in
# history.csv, ordered by id. Written as a single chain so no frame is sorted
# in place after being sliced (which can raise SettingWithCopyWarning).
sub = (
    pd.read_csv('history.csv')
    .drop_duplicates(subset='id', keep='first')
    .loc[:, ['id']]
    .sort_values('id')
)

In [16]:
# Start every id with "inactive, zero revenue" defaults, then index the frame
# by `id` while also keeping `id` as a regular column (drop=False).
sub = (
    sub
    .assign(is_active=0, next_month_revenue=0.0)
    .set_index('id', drop=False)
)

In [17]:
# Copy the model predictions into the full id list. `DataFrame.update` aligns
# on the index (`id` on both sides) and overwrites the shared columns
# (`is_active`, `next_month_revenue`) in place wherever `submission` has a
# non-NA value; ids without a prediction keep their current values.
sub.update(submission.set_index("id"))

In [18]:
# Put the columns in the order used for the output file, then display it.
submission_column_order = ['id', 'next_month_revenue', 'is_active']
sub = sub.loc[:, submission_column_order]

sub

Unnamed: 0_level_0,id,next_month_revenue,is_active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0.0,0
2,2,0.0,0
3,3,0.0,0
4,4,0.0,0
5,5,0.0,0
...,...,...,...
133670,133670,0.0,0
133671,133671,0.0,0
133672,133672,0.0,0
133673,133673,0.0,0


In [19]:
# Write the submission file. `index=False` omits the `id` index; the ids are
# still written because `id` is also present as a regular column.
sub.to_csv('submission.csv', index=False)