In [29]:
from datasets import Dataset
import catboost

In [4]:
rfsd = Dataset.load_from_disk('rfsd_actual_v3')
df = rfsd.to_pandas()

In [8]:
cat_cols = ['region', 'region_taxcode', 'okved', 'okved_section', 'okopf', 'okogu', 'okfc']
ignore_cols = ['inn']

target_column = 'workers_count'

In [9]:
active_and_filed = df[(df.filed == 1) & (df.region.isnull() == False)]
active_and_filed_and_with_workers = active_and_filed[active_and_filed.workers_count.isnull() == False]


In [19]:
nulls = active_and_filed_and_with_workers.isna().sum() / len(active_and_filed_and_with_workers) * 100

In [21]:
target_lines = [i for i in list(nulls[(nulls < 20)].index) if 'line' in i]

In [22]:
target_lines

['line_1200',
 'line_1230',
 'line_1250',
 'line_1300',
 'line_1500',
 'line_1520',
 'line_1600',
 'line_1700',
 'line_2110',
 'line_2120',
 'line_2200',
 'line_2300',
 'line_2350',
 'line_2400',
 'line_2500']

In [24]:
good_corr_lines = []
for col in target_lines:
    corr = active_and_filed_and_with_workers[col].corr(active_and_filed_and_with_workers.workers_count)
    if corr > 0.01:
        good_corr_lines.append(col)

good_corr_lines

['line_1230',
 'line_1250',
 'line_1500',
 'line_1520',
 'line_2110',
 'line_2120',
 'line_2200',
 'line_2300',
 'line_2350',
 'line_2400',
 'line_2500']

In [44]:
good_corr_lines = ['line_1230',
 'line_1250',
 'line_1500',
 'line_1520',
 'line_2110',
 'line_2120',
 'line_2200',
 'line_2300',
 'line_2350',
 'line_2400',
 'line_2500']

In [25]:
needed_cols = ['inn', 'region', 'region_taxcode', 'age', 'okved', 'okved_section', 'okopf', 'okogu', 'okfc'] + good_corr_lines + ['workers_count']


In [49]:
for i in needed_cols:
    print(i)

inn
region
region_taxcode
age
okved
okved_section
okopf
okogu
okfc
line_1230
line_1250
line_1500
line_1520
line_2110
line_2120
line_2200
line_2300
line_2350
line_2400
line_2500
workers_count


In [26]:
condition = True
for col in needed_cols:
    condition &= (active_and_filed_and_with_workers[col].isnull() == False)
final_filed_df = active_and_filed_and_with_workers[condition]

In [27]:
len(final_filed_df)

1242400

In [28]:
final_filed_minimized = final_filed_df[needed_cols]

In [31]:
cat_cols = ['region', 'region_taxcode', 'okved', 'okved_section', 'okopf', 'okogu', 'okfc']
ignore_cols = ['inn']
target_column = 'workers_count'

In [32]:
X = final_filed_minimized.drop(columns=[target_column])
y = final_filed_minimized[target_column]

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [37]:
model = catboost.CatBoostRegressor(iterations=1000, verbose=0, cat_features=cat_cols, ignored_features=ignore_cols)
model.fit(X_train, y_train, verbose=True)

Learning rate set to 0.12412
0:	learn: 75.4439943	total: 181ms	remaining: 3m
1:	learn: 72.8377802	total: 273ms	remaining: 2m 16s
2:	learn: 70.7049625	total: 385ms	remaining: 2m 7s
3:	learn: 68.9841208	total: 499ms	remaining: 2m 4s
4:	learn: 67.6012249	total: 589ms	remaining: 1m 57s
5:	learn: 66.4546358	total: 683ms	remaining: 1m 53s
6:	learn: 65.5115561	total: 760ms	remaining: 1m 47s
7:	learn: 64.7529861	total: 853ms	remaining: 1m 45s
8:	learn: 64.1453535	total: 952ms	remaining: 1m 44s
9:	learn: 63.5128611	total: 1.06s	remaining: 1m 44s
10:	learn: 62.9656614	total: 1.19s	remaining: 1m 46s
11:	learn: 62.5976460	total: 1.29s	remaining: 1m 46s
12:	learn: 62.1533825	total: 1.39s	remaining: 1m 45s
13:	learn: 61.8028591	total: 1.51s	remaining: 1m 46s
14:	learn: 61.4623590	total: 1.61s	remaining: 1m 45s
15:	learn: 61.1714074	total: 1.72s	remaining: 1m 45s
16:	learn: 60.8898490	total: 1.81s	remaining: 1m 44s
17:	learn: 60.6478930	total: 1.93s	remaining: 1m 45s
18:	learn: 60.3990154	total: 2.06

<catboost.core.CatBoostRegressor at 0x76b910734290>

In [38]:
list(zip(model.feature_names_,
model.feature_importances_))

[('inn', np.float64(0.0)),
 ('region', np.float64(2.150385164068745)),
 ('region_taxcode', np.float64(2.6311992438560843)),
 ('age', np.float64(4.094164474600083)),
 ('okved', np.float64(9.75769567757136)),
 ('okved_section', np.float64(16.807687340393446)),
 ('okopf', np.float64(2.1402502103896937)),
 ('okogu', np.float64(2.6285645861198317)),
 ('okfc', np.float64(1.7665741239180133)),
 ('line_1230', np.float64(2.1256013516513854)),
 ('line_1250', np.float64(4.698703305943389)),
 ('line_1500', np.float64(5.060779175643321)),
 ('line_1520', np.float64(3.84353333319642)),
 ('line_2110', np.float64(21.944065769618682)),
 ('line_2120', np.float64(8.983426890495931)),
 ('line_2200', np.float64(2.8540658410425177)),
 ('line_2300', np.float64(1.1228428967520583)),
 ('line_2350', np.float64(4.664589004321591)),
 ('line_2400', np.float64(1.6530286821344797)),
 ('line_2500', np.float64(1.0728429282829786))]

In [39]:
preds = model.predict(X_test)

In [40]:
import numpy as np

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [41]:
rmse(preds, y_test)

np.float64(53.837434773180384)

In [46]:
sum(preds), sum(y_test)

(np.float64(1699286.8751158288), 1711077.0)

In [43]:
def mean_relative_error(y_true, y_pred):
    return (abs((y_true - y_pred) / y_true)).mean()
    
mean_relative_error(preds, y_test)

np.float64(1.5553954447681675)

In [47]:
model.save_model('workers_count_prediction_model.cbm')