## Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score
from sklearn import preprocessing

In [2]:
SEED = 10

### Считываем тренировочный датасет и данные сотрудников

In [56]:
df_issues_train = pd.read_csv("train_issues.csv")
df_comment_train = pd.read_csv("train_comments.csv")
df_emp = pd.read_csv("employees.csv")

In [58]:
# For convenience, bring the task summaries to lower case
df_issues_train['summary'] = df_issues_train['summary'].map(lambda text: str(text).lower())

In [57]:
# Build a dict whose key is the issue id and whose value is the number of comments on that issue
comments_count_train = df_comment_train.groupby('issue_id')['comment_id'].count().to_dict()

In [59]:
# Из данных сотрудников удалим лишние поля
df_emp = df_emp.drop(["active", "full_name", "passport"], axis=1)

In [60]:
# Normalise the job titles a little and add binary flags that mark which
# role group (developer / QA / designer / devops) a title belongs to.

# Lower-cased title computed once; every flag below is a substring test on it.
position_lower = df_emp['position'].map(lambda title: str(title).lower())

df_emp['position_norm'] = position_lower.map(str.strip)
df_emp['is_dev'] = position_lower.map(lambda title: int('разработчик' in title))
df_emp['is_qa'] = position_lower.map(lambda title: int('тестировщик' in title))
df_emp['is_design'] = position_lower.map(lambda title: int('дизайнер' in title))
df_emp['is_devops'] = position_lower.map(lambda title: int('системный' in title or 'devops' in title))

## Рассмотрим датасет поближе

In [31]:
df_issues_train.sort_values('overall_worklogs', ascending=False)[1501:1550]

Unnamed: 0,id,created,key,summary,project_id,assignee_id,creator_id,overall_worklogs
7995,823139,2019-09-19 10:06:55.000,FPY-363,сверстать и настроить роутинг в страницах faq,29,1,242,22500
8302,682492,2020-07-07 06:52:30.000,FPY-795,модуль питания,29,291,291,22500
6464,722979,2020-09-14 06:45:28.318,BALT-7975,get rid off phasecommandobject enabled field,13,191,192,22500
2454,728089,2019-11-25 04:42:34.640,BALT-2865,classes list page,13,97,365,22500
202,819347,2019-11-26 13:33:29.000,SM-10990,предложение смены тарифа не работает если прил...,5,94,54,22500
5466,724325,2020-07-28 05:10:21.715,BALT-6629,classes page design,13,149,107,22440
5379,724428,2020-07-22 13:02:01.947,BALT-6526,backend: fetch students + recent students + pe...,13,10,10,22260
2220,728369,2019-10-31 10:18:02.214,BALT-2585,add api bindings for themes and layout (action...,13,208,208,22200
3986,726126,2020-03-31 07:13:08.558,BALT-4828,[markup] dashboard page,13,86,86,22200
230,819277,2019-12-11 07:05:15.000,SM-11025,"поднять службу ""кол-такси""",5,112,82,22200


## Добавим вспомогательные функции

In [72]:
# Substrings searched for in the lower-cased issue summary, per flag column.
# A flag is 1 when ANY of its substrings occurs in the summary.
# Insertion order defines the order of the generated columns; keep it stable,
# because the feature matrix built from this frame is consumed column by column.
_SUMMARY_FLAG_KEYWORDS = {
    'is_comm': ('communicat', 'коммуникац', 'комуникац', 'standup', 'all calls', 'discussion'),
    'is_planning': ('planning', 'планирование', 'оценить'),
    'is_bug': (
        'bug', 'incorrect', 'fix', 'cannot', 'исправ', 'фикс', 'не работает',
        'проблем', 'разобраться', 'не вывод', 'не показ',
    ),
    'is_setup': ('jira', 'confluence'),
    'is_onboarding': ('onboard',),
    'is_testing_task': ('тест', 'test'),
    'is_dev_task': ('develop', 'разработ', 'изготов', 'проектиров'),
    'is_administration': ('administration', 'дежурства админов'),
    'is_research_task': ('research', 'ресерч'),
    'is_non_coding': ('non-coding', 'некод'),
    'is_file_task': ('.java line',),
    # 'ui' is handled separately in transform_dataset (it has exclusions).
    'is_frontend': (
        'frontend', 'front', 'template', 'markup', 'layout', 'верстка',
        'styles', 'ui-components', 'canvas', 'svg', 'eslint',
    ),
    'is_backend': ('backend', 'integration'),
    'is_mobile_task': ('android', 'mobile'),
    'is_design_task': ('design', 'concept', 'ux', 'дизайн', 'экран'),
    'is_seo_task': ('seo',),
    'is_api': ('api', 'endpoint', 'эндпоинт'),
    'is_refactor': ('refactor', 'optimize', 'rework', 'рефактор'),
    'is_new_feature': (
        'creat', 'add', 'implement', 'созд', 'разработ', 'добав', 'изготов',
        'реализ', 'собрать',
    ),
    'is_update_task': (
        'update', 'change', 'adjust', 'изменить', 'поменять', 'корректир',
        'remove', 'удалить', 'переписать',
    ),
    'is_devops_task': (
        'ci/cd', 'pipeline', 'branch', 'migration', 'staging', 'logs', 'uat',
        'deploy', 'docker', 'kuber', 'data migration', 'поднять', 'upgrade',
    ),
}


def _flag_any(summary, keywords):
    """Return an int Series: 1 where the text contains any keyword, else 0."""
    return summary.map(
        lambda text: int(any(keyword in text for keyword in keywords))
    ).astype(int)


def transform_dataset(issues, comments_count, employees=None):
    """Join issues with employee data and add hand-crafted feature columns.

    Args:
        issues: DataFrame of issues. The code reads the columns ``id``,
            ``summary``, ``assignee_id`` and ``creator_id``.
        comments_count: mapping ``issue id -> number of comments``; issues
            missing from the mapping get 0.
        employees: DataFrame of employees with an ``id`` column. Defaults to
            the module-level ``df_emp`` when omitted.

    Returns:
        A new DataFrame: the inner join of ``issues`` and ``employees`` on
        ``assignee_id == id`` with missing values replaced by 0, the summary
        lower-cased, one 0/1 column per entry of ``_SUMMARY_FLAG_KEYWORDS``,
        plus ``is_self_task`` and ``comments``. Because both inputs carry an
        ``id`` column, the issue id ends up in ``id_x`` and the employee id
        in ``id_y``.

    NOTE(review): the join is ``how='inner'``, so issues whose assignee is
    absent from ``employees`` are dropped from the result. For a test set this
    means those issues get no prediction; confirm that this is acceptable.
    """
    if employees is None:
        employees = df_emp

    train = pd.merge(issues, employees, left_on="assignee_id", right_on="id", how='inner')
    train = train.fillna(0)

    # Lower-case once; every keyword below is written in lower case.
    summary = train['summary'].astype(str).str.lower()
    train['summary'] = summary

    for column, keywords in _SUMMARY_FLAG_KEYWORDS.items():
        flag = _flag_any(summary, keywords)
        if column == 'is_frontend':
            # A bare 'ui' counts as frontend only when the summary is not
            # about testing (e.g. "ui tests" must not be flagged).
            ui_without_tests = summary.map(
                lambda text: int('ui' in text and 'test' not in text and 'тест' not in text)
            ).astype(int)
            flag = flag | ui_without_tests
        train[column] = flag

    # 1 when the issue was created by the person it is assigned to.
    train['is_self_task'] = (train['creator_id'] == train['assignee_id']).astype(int)

    train['comments'] = train['id_x'].map(lambda issue_id: comments_count.get(issue_id, 0))

    return train

In [62]:
def save_submit_file(df_test, preds, path='submit.csv'):
    """Write the submission file.

    Args:
        df_test: transformed test DataFrame; its ``id_x`` column supplies the
            issue ids.
        preds: iterable of predicted ``overall_worklogs`` values, one per row
            of ``df_test`` and in the same order.
        path: destination CSV path. Defaults to ``'submit.csv'`` in the
            current working directory.

    The file has two columns, ``id`` and ``overall_worklogs``, and no index
    column.
    """
    submission = pd.DataFrame({
        'id': list(df_test['id_x']),
        'overall_worklogs': list(preds),
    })

    submission.to_csv(path, index=False)

## Выделим выборки

In [73]:
df_train = transform_dataset(df_issues_train, comments_count_train)

In [74]:
stuff = ['id_x', 'id_y', 'overall_worklogs', 'key', 'created', 'summary', 'position']

In [75]:
X = df_train.drop(stuff, axis = 1)
y = df_train[["overall_worklogs"]]

In [76]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [77]:
# Train and validate on the natural log of the target; the test predictions
# are mapped back with np.exp before the submission file is written.
# NOTE(review): np.log gives -inf for a target of 0 — confirm that
# overall_worklogs is strictly positive in the training data.
y_train = np.log(y_train)
y_val = np.log(y_val)

## Построение и обучение модели

In [78]:
categorical_features = [
    'project_id', 'creator_id', 'position_norm', 'assignee_id', 'hiring_type', 
    'payment_type', 'english_level', 'salary_calculation_type'
]

train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features
)
val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=categorical_features
)

In [79]:
model = CatBoostRegressor(
    learning_rate=0.0013,
    iterations=8000,
    depth=8,
    random_seed=SEED,
    verbose=1000,
    l2_leaf_reg=0.02
)

In [936]:
# Hyper-parameter grid passed to the model's built-in grid search.
grid = {
    'iterations': [5000, 6000],
    'learning_rate': [0.0015, 0.0017],
    'depth': [8, 9, 10],
    'l2_leaf_reg': [0.2, 0.5, 1, 3]
}

# NOTE(review): the recorded run of this cell ended with
# "CatBoostError: Model was fitted before hyperparameters tuning" — the search
# was apparently started on an already fitted `model`. Presumably `model` has
# to be re-created (unfitted) before this call; confirm before re-running.
model.grid_search(grid, train_pool)


bestTest = 1.135167575
bestIteration = 4999

0:	loss: 1.1351676	best: 1.1351676 (0)	total: 4m 14s	remaining: 21m 10s

bestTest = 1.122338093
bestIteration = 5999

1:	loss: 1.1223381	best: 1.1223381 (1)	total: 10m 2s	remaining: 20m 4s

bestTest = 1.131813352
bestIteration = 4999

2:	loss: 1.1318134	best: 1.1223381 (1)	total: 16m 32s	remaining: 16m 32s

bestTest = 1.119316679
bestIteration = 5997

3:	loss: 1.1193167	best: 1.1193167 (3)	total: 25m 19s	remaining: 12m 39s

bestTest = 1.129307596
bestIteration = 4999

4:	loss: 1.1293076	best: 1.1193167 (3)	total: 50m 52s	remaining: 10m 10s

bestTest = 1.11697537
bestIteration = 5999

5:	loss: 1.1169754	best: 1.1169754 (5)	total: 1h 25m 3s	remaining: 0us
Estimating final quality...


CatBoostError: Model was fitted before hyperparameters tuning. You can't change hyperparameters of fitted model.

In [937]:
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
 'iterations': 6000,
 'sampling_frequency': 'PerTree',
 'fold_permutation_block': 0,
 'leaf_estimation_method': 'Newton',
 'counter_calc_method': 'SkipTest',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'ctr_leaf_count_limit': 18446744073709551615,
 'bayesian_matrix_reg': 0.10000000149011612,
 'one_hot_max_size': 2,
 'l2_leaf_reg': 0.20000000298023224,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'max_ctr_complexity': 4,
 'model_size_reg': 0.5,
 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Co

In [80]:
model.fit(train_pool, use_best_model=True, eval_set=val_pool)

0:	learn: 1.2278355	test: 1.2635757	best: 1.2635757 (0)	total: 55.2ms	remaining: 7m 21s
1000:	learn: 1.1253212	test: 1.1838093	best: 1.1838093 (1000)	total: 1m	remaining: 7m 3s
2000:	learn: 1.0896428	test: 1.1606465	best: 1.1606465 (2000)	total: 2m 19s	remaining: 6m 58s
3000:	learn: 1.0691922	test: 1.1502616	best: 1.1502616 (3000)	total: 3m 40s	remaining: 6m 6s
4000:	learn: 1.0544254	test: 1.1443863	best: 1.1443863 (4000)	total: 4m 52s	remaining: 4m 52s
5000:	learn: 1.0422301	test: 1.1398428	best: 1.1398427 (4999)	total: 5m 59s	remaining: 3m 35s
6000:	learn: 1.0308597	test: 1.1362177	best: 1.1362177 (6000)	total: 7m 19s	remaining: 2m 26s
7000:	learn: 1.0191638	test: 1.1327091	best: 1.1327091 (7000)	total: 8m 30s	remaining: 1m 12s
7999:	learn: 1.0065614	test: 1.1293039	best: 1.1293039 (7999)	total: 9m 39s	remaining: 0us

bestTest = 1.12930387
bestIteration = 7999



<catboost.core.CatBoostRegressor at 0x243797730c8>

In [81]:
# Получим предсказания для тренировочного и валидационного датасета

pred_train = model.predict(X_train)
pred_val = model.predict(X_val)

In [82]:
# Оценим метрику R2

r2_score(y_train, pred_train), r2_score(y_val, pred_val)

(0.2853365554560179, 0.19931869626977516)

In [1074]:
min(np.exp(pred_val)), max(np.exp(pred_val))

(2006.3331830802852, 94115.77524605562)

## Посмотрим на вклад каждого признака

In [91]:
# Print every feature next to its importance score, in column order
columns = list(X_train.columns)
for column_name, importance in zip(columns, model.get_feature_importance()):
    print(column_name, importance, sep=' -> ')

project_id -> 7.845958223011912
assignee_id -> 14.097102156277604
creator_id -> 14.386261994301796
hiring_type -> 3.4951658569934114
payment_type -> 6.917930072715343
salary_calculation_type -> 2.234793989846647
english_level -> 4.125239364145091
is_nda_signed -> 0.3927404594841476
is_labor_contract_signed -> 0.3560178116828472
is_added_to_internal_chats -> 0.4382536182604846
is_added_one_to_one -> 0.16571780636075872
position_norm -> 4.792906988262446
is_dev -> 0.48893835219659243
is_qa -> 0.00046879383002248494
is_design -> 0.028489279288606013
is_devops -> 0.09995255906075394
is_comm -> 5.89783402435394
is_planning -> 1.1480119240819688
is_bug -> 1.5555547507016032
is_setup -> 0.2513450378514521
is_onboarding -> 0.24200080076692718
is_testing_task -> 1.2806867941536386
is_dev_task -> 0.5216808185383759
is_administration -> 0.2128352708829292
is_research_task -> 0.5932977652793798
is_non_coding -> 0.052199909897528755
is_file_task -> 0.022265753302683277
is_frontend -> 3.452205676546

## Считываем и преобразуем тестовый датасет

In [83]:
df_test = pd.read_csv('test_issues.csv')
df_comment_test = pd.read_csv('test_comments.csv')

In [84]:
# Issue id -> number of comments, for the test issues
comments_count_test = df_comment_test.groupby('issue_id')['comment_id'].count().to_dict()

In [85]:
df_test = transform_dataset(df_test, comments_count_test)

In [86]:
stuff = ['id_x', 'id_y', 'key', 'created', 'summary', 'position']

In [87]:
X_test = df_test.drop(stuff, axis = 1)

## Получаем предсказания и готовим файл для отправки

In [88]:
pred_test = model.predict(X_test)

In [89]:
# Undo the log transform with exp, round to the nearest integer, then round
# up to the next multiple of 60 (presumably seconds -> whole minutes; confirm
# the unit of overall_worklogs).
modified_preds = list(np.ceil(np.round(np.exp(pred_test)) / 60) * 60)

In [90]:
save_submit_file(df_test, modified_preds)