### Imports

In [1]:
import numpy as np
import pandas as pd
from category_encoders import CountEncoder
import lightgbm as lgb
from scipy.stats import gaussian_kde
# Seed NumPy's global RNG so the np.random.choice sampling used further down is repeatable.
np.random.seed(42)

### Constants

In [2]:
# Bookkeeping / target columns: excluded from the model's feature set.
key_cols = [
    'id',
    'date',
    'target',
    'log_target',
    'price_type',
]

# Raw categorical columns: count-encoded into `for_cnt_cat_*` copies, then
# excluded from the feature set themselves.
cat_cols = [
    'city',
    'region',
    'realty_type',
    'street',
    'floor',
    'osm_city_nearest_name',
]

### Main

In [3]:
# Load both splits; the price column is renamed to `target` so the frames share a schema.
test = pd.read_csv('../data/test.csv').rename({'per_square_meter_price': 'target'}, axis=1)
# Placeholder targets so `test` carries the same key columns as `train`.
test['target'] = 0
test['log_target'] = 0
train = pd.read_csv('../data/train.csv').rename({'per_square_meter_price': 'target'}, axis=1)

# Normalise 'ё' -> 'е' in the name columns BEFORE cities are compared across
# the two frames: otherwise a city spelled with 'ё' in one file and 'е' in the
# other is treated as missing from test and its rows are filtered out.
for frame in (train, test):
    for name_col in ('city', 'osm_city_nearest_name'):
        # regex=False: plain character substitution, independent of pandas' default.
        frame[name_col] = frame[name_col].str.replace('ё', 'е', regex=False)

# Keep train rows whose city also occurs in test, plus every price_type == 1 row.
train = train[(train.city.isin(set(test.city))) | (train.price_type == 1)].reset_index(drop=True)
# The model is fitted on the natural log of the target.
train['log_target'] = np.log(train['target'])

### Categories

In [4]:
# Duplicate every categorical column under a `for_cnt_cat_` prefix; the
# duplicates are the ones handed to the count encoder, the originals stay as-is.
count_cols = ['for_cnt_cat_' + name for name in cat_cols]
for source_col, count_col in zip(cat_cols, count_cols):
    train[count_col] = train[source_col].copy()
    test[count_col] = test[source_col].copy()

# Fit the encoder on train only, then apply the fitted mapping to test.
ce = CountEncoder(cols=count_cols)
train = ce.fit_transform(train)
test = ce.transform(test)

### Model

###### Work with train dataset

In [5]:
def get_sampled_train(train_1, train_0, sample_fraq=0.5, weights=None):
    """Build one training frame from a weighted subsample of ``train_0`` plus all of ``train_1``.

    Parameters
    ----------
    train_1 : pd.DataFrame
        Rows that are always kept in full.
    train_0 : pd.DataFrame
        Rows to subsample without replacement.
    sample_fraq : float, default 0.5
        Fraction of ``train_0`` rows to draw; the count is ``int(len(train_0) * sample_fraq)``.
    weights : array-like, optional
        One non-negative sampling weight per ``train_0`` row (normalised here).
        When omitted, the notebook-level ``probs`` array is used.

    Returns
    -------
    pd.DataFrame
        Sampled ``train_0`` rows followed by ``train_1``, with a fresh 0..n-1 index.

    Notes
    -----
    Draws from NumPy's global RNG, so results depend on ``np.random.seed``.
    """
    if weights is None:
        # Backward-compatible default: the densities computed at notebook level.
        weights = probs
    weights = np.asarray(weights, dtype=float)

    n_rows = train_0.shape[0]
    n_pick = int(n_rows * sample_fraq)
    choice_idx = np.random.choice(n_rows, size=n_pick, replace=False,
                                  p=weights / weights.sum())
    sampled_0 = train_0.iloc[choice_idx].reset_index(drop=True)
    return pd.concat([sampled_0, train_1]).reset_index(drop=True)

# Split the training data by price type; each part gets a fresh 0..n-1 index.
train_0 = train.loc[train['price_type'] == 0].reset_index(drop=True)
train_1 = train.loc[train['price_type'] == 1].reset_index(drop=True)

# Rescale the price_type == 0 log-targets so their mean equals the
# price_type == 1 mean.
mean_log_1 = train_1['log_target'].mean()
mean_log_0 = train_0['log_target'].mean()
train_0['log_target'] = train_0['log_target'] * mean_log_1 / mean_log_0

# Density of the price_type == 1 log-target distribution, evaluated at every
# (rescaled) price_type == 0 row; used as sampling weights by get_sampled_train.
kde = gaussian_kde(train_1['log_target'])
probs = kde.evaluate(train_0['log_target'])

###### Train

In [6]:
# Ensemble layout: for each fresh subsample of the training data, fit several
# LightGBM models that differ only in their seed, and keep every model's
# test-set prediction (in log-target space).
n_resamples = 25  # number of independent draws from get_sampled_train
n_seeds = 10      # LightGBM models (seeds 0..n_seeds-1) fitted per draw

list_of_final_tables = []
for resample_idx in range(n_resamples):
    current_train = get_sampled_train(train_1, train_0)
    # Features = every column except the key columns and the raw categoricals.
    train_cols = current_train.columns.difference(key_cols + cat_cols)
    train_data = lgb.Dataset(current_train[train_cols], current_train['log_target'])

    seed_preds = {}
    # Distinct loop variable: the seed index must not shadow the resample index.
    for seed in range(n_seeds):
        lgbm_params = {'metric': 'mape',
                       'objective': 'mse',
                       'learning_rate': 0.05,
                       'n_jobs': 12,
                       'seed': seed,
                       'feature_fraction': 0.8}
        # NOTE(review): `verbose_eval` was dropped from lgb.train in LightGBM 4.x —
        # confirm the pinned version before upgrading.
        lgbm = lgb.train(lgbm_params, train_data, num_boost_round=3100, verbose_eval=-1)
        seed_preds['pred_' + str(seed)] = lgbm.predict(test[train_cols])

    # One column per seed: pred_0 .. pred_{n_seeds-1}.
    final_table = pd.DataFrame(seed_preds)
    list_of_final_tables.append(final_table)

### Submission

In [7]:
# Collapse each resample's table to the row-wise mean over its seed models.
final_df = pd.DataFrame({
    'pred_model_' + str(model_idx): table.mean(axis=1)
    for model_idx, table in enumerate(list_of_final_tables)
})

submission = pd.read_csv('../data/test_submission.csv')
# Average across resamples in log space, then map back to the price scale.
preds = np.exp(final_df.mean(axis=1))
# Assignment aligns on the row index.
# NOTE(review): assumes test_submission.csv lists rows in the same order as test.csv — confirm.
submission['per_square_meter_price'] = preds
output_columns = ['id', 'per_square_meter_price']
submission[output_columns].to_csv('./sampled_25_models_blend_10_city_to_osm.csv', index=False)

_____

_____

_____

_____

_____

_____

_____

_____

_____

_____