In [6]:
import numpy as np
import pandas as pd
import random,os
import sys
from sklearn.preprocessing import StandardScaler, RobustScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from IPython.display import display

In [13]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


seed_everything()
pd.set_option('display.max_columns', None)

#загрузка датасета
train=pd.read_csv('covid_data_train.csv')
test=pd.read_csv('covid_data_test.csv')
test['Unnamed: 1']=test['Unnamed: 0']
test.set_index('Unnamed: 1',inplace=True)

nans = train[train.population.isna()]
train = train.drop(nans.index,axis=0)

In [14]:
columns_to_drop_70 = [] #список столбцов, в которых содержание NaN >= 70%
for c in train.columns:
    if train[c].isna().sum()/len(train)>=0.7:
        columns_to_drop_70.append(c)


categoricals = ['district', 'subject']

ivls_ekmos = ['ivl_per_100k', 'ivl_number', 'ekmo_per_100k', 'ekmo_number']

tuberculs = ['num_patients_tubercul_1992','num_patients_tubercul_1993','num_patients_tubercul_1994','num_patients_tubercul_1995',
           'num_patients_tubercul_1996','num_patients_tubercul_1997','num_patients_tubercul_1998','num_patients_tubercul_1999',
           'num_patients_tubercul_2000','num_patients_tubercul_2001','num_patients_tubercul_2002','num_patients_tubercul_2003',
           'num_patients_tubercul_2004','num_patients_tubercul_2005','num_patients_tubercul_2006','num_patients_tubercul_2007',
           'num_patients_tubercul_2008','num_patients_tubercul_2009','num_patients_tubercul_2010','num_patients_tubercul_2011',
           'num_patients_tubercul_2012','num_patients_tubercul_2013','num_patients_tubercul_2014','num_patients_tubercul_2015',
           'num_patients_tubercul_2016','num_patients_tubercul_2017']

transports = ['epirank_bus','epirank_train','epirank_bus_cat','epirank_train_cat','epirank_bus',
            'epirank_train','epirank_bus_cat','epirank_train_cat']

subjects = ['Хакасия', 'Оренбургская область', 'Краснодарский край', 'Татарстан', 'Ростовская область', 'Свердловская область', 
          'Чувашия', 'Якутия', 'Алтайский край', 'Владимирская область', 'Пермский край', 'Белгородская область', 'Тульская область', 
          'Иркутская область', 'Крым', 'Чукотский АО', 'Тверская область', 'Кемеровская область', 'Мурманская область', 'Чечня', 
          'Мордовия', 'Нижегородская область', 'Саратовская область', 'Приморский край', 'Красноярский край', 'Архангельская область', 
          'Томская область', 'Астраханская область', 'Челябинская область', 'Вологодская область', 'Бурятия', 'Кабардино-Балкария', 
          'Калужская область', 'Московская область', 'Забайкальский край', 'Новосибирская область', 'Ульяновская область', 
          'Пензенская область', 'Амурская область', 'Карелия', 'Башкортостан', 'Ханты-Мансийский АО — Югра', 'Северная Осетия — Алания', 
          'Хабаровский край', 'Еврейская АО', 'Ставропольский край', 'Воронежская область', 'Ленинградская область', 'Орловская область', 
          'Новгородская область', 'Брянская область', 'Костромская область', 'Смоленская область', 'Псковская область', 'Ивановская область', 
          'Волгоградская область', 'Марий Эл', 'Коми', 'Удмуртия', 'Кировская область', 'Ярославская область', 'Алтай', 'Калмыкия', 
          'Липецкая область', 'Ямало-Ненецкий АО', 'Калининградская область', 'Курганская область', 'Дагестан', 'Сахалинская область', 
          'Курская область', 'Тамбовская область', 'Самарская область', 'Тюменская область', 'Омская область', 'Рязанская область', 
          'Тыва', 'Магаданская область', 'Адыгея', 'Москва', 'Ингушетия', 'Ненецкий АО', 'Камчатский край', 'Санкт-Петербург', 'Севастополь']

In [15]:
#заполнение NaN нулями, удаление ненужных столбцов
def prepare_and_clean_data(df):
    df = df.drop(['region_x','Unnamed: 0'], axis=1)
    df['has_metro'] = df['has_metro'].fillna(0)
    df = df.drop(columns_to_drop_70, axis=1)
    df[ivls_ekmos] = df[ivls_ekmos].fillna(0)
    df = df.drop(transports, axis=1)

    return df

#'умное' заполнение пустых значений для признаков, связанных с туберкулезом
def smart_fillna_for_tubercul(train, test):
    tmp = pd.concat([train, test])

    fill_vals = pd.DataFrame(tmp.groupby('name')[tuberculs])
    fill_vals_dict = {}

    for i in range(len(fill_vals)):
        fill_vals_dict[fill_vals[0][i]] = np.nanmean(fill_vals[1][i]) #np.nanmean(fill_vals[1][i][tuberculs]) for pandas==1.3

    for i in tmp.index:
        tmp.loc[i,tuberculs] = tmp.loc[i,tuberculs].fillna(fill_vals_dict[tmp.loc[i,'name']])
      
    tmp = tmp.drop('name',axis=1)

    for i in tmp.columns:
        if tmp[i].isna().sum()>0 and i not in categoricals:
            tmp[i] = tmp[i].fillna(np.nanmean(tmp[i]))
    
    return tmp[:len(train)], tmp[len(train):]

#создание новых признаков
def making_features(df):
    df['avg_temp_mul_hum'] = df['humidity_max'] / df['avg_temp_max']
    return df

#нормализация данных
def scaling(train, test):
    count_var = []

    tmp = pd.concat([train, test])
    for col in tmp.columns:
        if tmp[col].dtype != 'object':
            count_var.append(col)

    df_tmp = tmp[(count_var)]
    std_scaler = StandardScaler()
    std_scaler.fit(df_tmp)

    scaled_train = std_scaler.transform(train[count_var])
    scaled_test = std_scaler.transform(test[count_var])

    count_var_col = []
    for col_name in count_var:
        count_var_col.append(col_name + '_std')

    train[count_var_col] = scaled_train
    test[count_var_col]  = scaled_test

    train = train.drop(count_var, axis=1)
    test = test.drop(count_var, axis=1)

    return train, test

In [16]:
train = prepare_and_clean_data(train)
test = prepare_and_clean_data(test)

y = train['inf_rate']
train = train.drop('inf_rate', axis=1)
test = test.drop('inf_rate', axis=1)

train, test = smart_fillna_for_tubercul(train, test)

train = making_features(train)
test = making_features(test)

train, test = scaling(train, test)

tmp = pd.concat([train, test])
tmp = pd.get_dummies(tmp)

train, test = tmp[:len(train)], tmp[len(train):]

  self[col] = igetitem(value, i)


In [None]:
X_train, X_val, y_train, y_val = train_test_split(train, y, test_size=0.2, random_state=42, shuffle=False)

#модель для экспериментов
cb_for_valid = CatBoostRegressor(iterations=13000, eval_metric='MAE', random_seed=42, random_strength=0.6, learning_rate=0.007)
cb_for_valid.fit(X_train, y_train, verbose=500, eval_set=(X_val, y_val))
print(mean_absolute_error(y_val, cb_for_valid.predict(X_val)))

0:	learn: 0.7492103	test: 0.6708399	best: 0.6708399 (0)	total: 28.3ms	remaining: 6m 8s
500:	learn: 0.1303369	test: 0.1057503	best: 0.1057503 (500)	total: 7.99s	remaining: 3m 19s
1000:	learn: 0.0439168	test: 0.0339506	best: 0.0339506 (1000)	total: 16.2s	remaining: 3m 13s
1500:	learn: 0.0254615	test: 0.0173108	best: 0.0173108 (1500)	total: 23.4s	remaining: 2m 59s
2000:	learn: 0.0183606	test: 0.0096541	best: 0.0096541 (2000)	total: 31.3s	remaining: 2m 52s
2500:	learn: 0.0153688	test: 0.0064675	best: 0.0064675 (2500)	total: 38.7s	remaining: 2m 42s
3000:	learn: 0.0137961	test: 0.0050295	best: 0.0050295 (3000)	total: 42.6s	remaining: 2m 21s
3500:	learn: 0.0131416	test: 0.0044286	best: 0.0044286 (3500)	total: 46.4s	remaining: 2m 5s
4000:	learn: 0.0127255	test: 0.0040999	best: 0.0040999 (4000)	total: 50.2s	remaining: 1m 52s
4500:	learn: 0.0124604	test: 0.0039164	best: 0.0039164 (4500)	total: 53.9s	remaining: 1m 41s
5000:	learn: 0.0122661	test: 0.0037829	best: 0.0037829 (5000)	total: 57.7s	rema

In [None]:
#важность признаков
imp = cb_for_valid.get_feature_importance(prettified=True).set_index('Feature Id')
display(imp)

Unnamed: 0_level_0,Importances
Feature Id,Unnamed: 1_level_1
wind_speed_ms_std_std,8.189217
volume_serv_tourism_2017_std,4.180883
volume_serv_others_2017_std,3.918051
work_ratio_15-72_years_std,3.590307
volume_serv_housing_2017_std,3.570692
...,...
subject_Ингушетия,0.000000
subject_Ненецкий АО,0.000000
subject_Камчатский край,0.000000
subject_Санкт-Петербург,0.000000


In [18]:
#обучение на полном датасете финальной модели для прода
final_cb = CatBoostRegressor(iterations=13000, random_seed=42, random_strength=0.6, learning_rate=0.007)
final_cb.fit(train, y, verbose=500)

0:	learn: 0.9248008	total: 58ms	remaining: 12m 33s
500:	learn: 0.1565610	total: 6.35s	remaining: 2m 38s
1000:	learn: 0.0822574	total: 10.1s	remaining: 2m 1s
1500:	learn: 0.0703916	total: 13.9s	remaining: 1m 46s
2000:	learn: 0.0678446	total: 17.7s	remaining: 1m 37s
2500:	learn: 0.0673603	total: 21.5s	remaining: 1m 30s
3000:	learn: 0.0672623	total: 25.3s	remaining: 1m 24s
3500:	learn: 0.0672293	total: 29s	remaining: 1m 18s
4000:	learn: 0.0672183	total: 32.8s	remaining: 1m 13s
4500:	learn: 0.0672138	total: 36.6s	remaining: 1m 9s
5000:	learn: 0.0672107	total: 40.4s	remaining: 1m 4s
5500:	learn: 0.0672092	total: 44.1s	remaining: 1m
6000:	learn: 0.0672081	total: 47.9s	remaining: 55.9s
6500:	learn: 0.0672073	total: 51.7s	remaining: 51.7s
7000:	learn: 0.0672067	total: 55.5s	remaining: 47.6s
7500:	learn: 0.0672064	total: 59.3s	remaining: 43.5s
8000:	learn: 0.0672061	total: 1m 3s	remaining: 39.4s
8500:	learn: 0.0672059	total: 1m 6s	remaining: 35.4s
9000:	learn: 0.0672058	total: 1m 12s	remaining:

<catboost.core.CatBoostRegressor at 0x7f2e399a0f50>

In [21]:
#предсказания 
test_preds = final_cb.predict(test)
sub = pd.DataFrame({'Unnamed: 0': test.index, 'inf_rate': test_preds})
print(sub[:5])

sub.to_csv('FSD.csv', index=False)
final_cb.save_model('final_model')

   Unnamed: 0  inf_rate
0         451  2.708254
1         452  0.693207
2         453  0.693946
3         454  1.609505
4         455  1.386796
