In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split

In [82]:
train = pd.read_csv('data/bank/train.csv')
train.fillna('-', inplace=True)

train.columns

Index(['_ID_', '_TYPE_', '_DATE_', '_ADDRESS_', '_PRICE_', '_AREA_', '_METRO_',
       '_LAT_', '_LON_', '_DESC_'],
      dtype='object')

In [83]:
test = pd.read_csv('data/bank/test.csv')
test.fillna('-', inplace=True)

test.columns

Index(['_ID_', '_CITY_', '_ADRS_', '_OBJ_TYPE_', '_TTL_S_', '_F1_S_', '_F1_U_',
       '_FC_S_', '_FC_U_', '_F0_S_', '_F0_U_', '_FA_S_', '_FA_U_', '_F2_S_',
       '_F2_U_', '_F3_S_', '_F3_U_', '_AREA_', '_CHARACT_', '_LINE_',
       '_METRO_', '_ROUND_', '_FOOT_TRAF_', '_IS_PRKNG_', '_IS_WIN_',
       '_IS_SEP_ENT_', '_IS_VENT_', '_DECOR_', '_IS_COM_', '_F1_H_', '_DATE_'],
      dtype='object')

Удаляем строки из train, где нет указана цена. Остальные переводим в тысячи рублей.

In [84]:
train = train[[(x.find('РУБ') != -1) for x in train['_PRICE_']]]

train['_PRICE_'] = pd.Series([int(''.join(x[:x.find('РУБ')].split())) / 10000 
                             for x in train['_PRICE_']], train.index)
len(train)

15756

Удаляем строкиз из train, где не указана площадь. Остальные переводим в тысячи рублей.

In [85]:
train = train[[x.find('м') != -1 for x in train['_AREA_']]]
train['_AREA_'] = pd.Series([float(x[:x.find('м')].replace(',', '.')) 
                             for x in train['_AREA_']],
                            train.index)
len(train)

15741

Преобразование тестовых данных.

In [86]:
test['_TTL_S_'] = pd.Series([float(x.replace(',', '.')) if x[0].isdigit() else -1
                             for x in test['_TTL_S_']],
                            test.index)

In [87]:
input_X = [[x] if x != -1 else [np.mean(input_X)] for x in train['_AREA_']]
input_y = train['_PRICE_']
print(len(train_X), len(train_y))

11018 11018


In [88]:
output_X = [[x] for x in test['_TTL_S_']]

In [89]:
def fit(X, y):
    rf_est = GridSearchCV(RandomForestRegressor(),
                          param_grid={'n_estimators': [i for i in range(30, 301, 30)]})
    rf_est.fit(X, y)
    gb_est = GridSearchCV(GradientBoostingRegressor(),
                          param_grid={'n_estimators': [i for i in range(50, 261, 30)]})
    gb_est.fit([x + [y] for x, y in zip(X, rf_est.predict(X))], y)
    return rf_est, gb_est

In [90]:
train_X, test_X, train_y, test_y = train_test_split(input_X, input_y, test_size=0.3)
rf_est, gb_est = fit(train_X, train_y)

In [91]:
test_X = [x + [y] for x, y in zip(test_X, rf_est.predict(test_X))]
train_X = [x + [y] for x, y in zip(train_X, rf_est.predict(train_X))]

In [92]:
print(np.sqrt(np.mean(np.power(np.log2(gb_est.predict(test_X)) - np.log2(test_y), 2))))
print(np.sqrt(np.mean(np.power(np.log2(gb_est.predict(train_X)) - np.log2(train_y), 2))))

2.30134781618
1.98683132496


In [93]:
output_X = [x + [y] for x, y in zip(output_X, rf_est.predict(output_X))]
output_y = gb_est.predict(output_X)
prediction = pd.DataFrame(data={'_ID_': test['_ID_'], '_PRICE_': output_y})
prediction.to_csv('data/bank/prediction.csv', index=False)
print(prediction.head())

   _ID_      _PRICE_
0     0   538.611454
1     1  1400.539347
2     2   910.431439
3     3   572.659288
4     4   682.265474
