In [843]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import r2_score as r2

pd.options.mode.chained_assignment = None

In [844]:
data = pd.read_csv('train.csv')

train, valid = train_test_split(data, test_size=0.27, random_state=42)

### Подготовка данных для обучения модели

In [845]:
train.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,7300.0,7300.0,7300.0,7300.0,5746.0,7300.0,7300.0,7300.0,7300.0,7300.0,7300.0,7300.0,7300.0,3785.0,7300.0,7300.0,7300.0
mean,8374.583836,50.532603,1.88863,56.286578,37.459112,6.443699,8.504521,12.623151,4731.884,0.117534,24.719178,5356.172329,8.070959,1132.45284,1.307397,4.25863,214168.702444
std,4859.222359,43.731954,0.840675,20.415394,100.045376,33.302814,5.220509,6.842145,234667.8,0.118379,17.584019,4008.200814,23.925105,1016.76975,1.48575,4.820887,92501.503024
min,0.0,0.0,0.0,2.377248,0.370619,0.0,1.0,0.0,1912.0,0.0,0.0,168.0,0.0,30.0,0.0,0.0,59174.778028
25%,4170.25,19.0,1.0,41.745094,22.80189,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,325.0,0.0,1.0,154466.5912
50%,8380.5,36.0,2.0,52.590935,32.80619,6.0,7.0,13.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0,192359.511502
75%,12542.75,75.0,2.0,65.981105,45.089597,9.0,12.0,17.0,2001.0,0.194489,36.0,7287.0,5.0,1548.0,2.0,6.0,249053.269789
max,16796.0,209.0,19.0,604.705972,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,627525.072788


### Исправление явных ошибок

In [846]:
train.loc[train['HouseYear'] > 2020, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
4189,11607,147,2.0,44.791836,28.360393,5.0,4,9.0,4968,0.319809,B,B,25,4756,16,2857.0,5,8,B,243028.603096
1497,10814,109,1.0,37.26507,20.239714,9.0,9,12.0,20052011,0.13633,B,B,30,6141,10,262.0,3,6,B,254084.534396


In [847]:
train.loc[train['HouseYear'] == 4968, 'HouseYear'] = 1968
train.loc[train['HouseYear'] == 20052011, 'HouseYear'] = 2008

In [848]:
train.loc[train['Id'].isin([28,2307]), 'Square'] = train['Square']/10

In [849]:
def clean_data(df):
    df.loc[(df['LifeSquare'] < 1), 'LifeSquare'] = df['LifeSquare']*100
    df.loc[(df['Square'] < 1), 'Square'] = df['Square']*100
    df.loc[(df['LifeSquare'] <= 10), 'LifeSquare'] = df['LifeSquare']*10
    df.loc[(df['Square'] <= 10), 'Square'] = df['Square']*10
    df.loc[(df['Rooms'] == 0) & (df['Square'] < 40 ), 'Rooms'] = 1
    df.loc[(df['Rooms'] == 0) & (df['Square'] >= 40 ) & (df['Square'] < 90), 'Rooms'] = 2
    df.loc[(df['Rooms'] == 0) & (df['Square'] >= 90 ) & (df['Square'] < 140), 'Rooms'] = 3
    df.loc[(df['Rooms'] > 4) & (df['Square'] < 40 ), 'Rooms'] = 1
    df.loc[(df['Rooms'] > 4) & (df['Square'] >= 40 ) & (df['Square'] < 90), 'Rooms'] = 2
    df.loc[(df['Rooms'] > 4) & (df['Square'] >= 90 ) & (df['Square'] < 140), 'Rooms'] = 3
    df.loc[(df['Square'] < 30) & (df['Rooms'] >= 3), 'Rooms'] = 1
    df.loc[df['Square'] < df['LifeSquare'], 'LifeSquare'] = df['Square']
    df.loc[:, 'LifeSquare'] = df['Square'].fillna(df['Square'])
    return df

### Добавление признаков, вычисленных по исходным данным

In [850]:
def add_features(df):
    # размер района
    district = df.groupby(['DistrictId', 'Ecology_2', 'Ecology_3', 'Shops_2'], as_index=False)['Square'].mean().rename(columns={'Square': 'district_shape'})
    df = pd.merge(df, district, on=['DistrictId', 'Ecology_2', 'Ecology_3', 'Shops_2'], how='left')

    m_price = df.groupby(['DistrictId', 'Square', 'Ecology_2', 'Ecology_3', 'Shops_2'], as_index=False)['Price'].mean().rename(columns={'Price': 'mean_price'})
    df = pd.merge(df, m_price, on=['DistrictId', 'Square', 'Ecology_2', 'Ecology_3', 'Shops_2'], how='left')

    df['m_price'] = df['Price']/df['Square']

    mean_m_price = df.groupby(['DistrictId', 'Ecology_2', 'Ecology_3', 'Shops_2'], as_index=False)['m_price'].mean().rename(columns={'m_price': 'mean_m_price'})
    df = pd.merge(df, mean_m_price, on=['DistrictId', 'Ecology_2', 'Ecology_3', 'Shops_2'], how='left')

    return df

### Построение модели

In [851]:
model = RF(n_estimators=250, max_depth=15, random_state=42)

In [852]:
# Чистим данные, добавляем признаки
train = clean_data(train)
train = add_features(train)

In [853]:
features = ['DistrictId','Rooms','Square','LifeSquare','HouseYear','Ecology_1','Social_1','Social_2',
            'Social_3','Helthcare_2','Shops_1']

In [854]:
model.fit(train.loc[:, features], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

### Проверка

In [855]:
def get_prediction(model, df, features=features, evaluate='Yes'):
    pred = model.predict(df.loc[:, features])
    if evaluate == 'Yes':
        r2_value = r2(df['Price'], pred)
        print('R2: {}'.format(r2_value))
    return pred

In [856]:
pred_train = get_prediction(model=model, df=train)

R2: 0.9366299753400765


In [857]:
valid = clean_data(valid)
valid = add_features(valid)

In [858]:
pred_valid = get_prediction(model=model, df=valid)

R2: 0.7116106277742006


### Предсказание цен

In [859]:
test_data = pd.read_csv('test.csv')

In [860]:
test_data.shape

(5000, 19)

In [861]:
test_data = clean_data(test_data)

In [862]:
test_data['Price'] = model.predict(test_data.loc[:, features])

In [863]:
test_data['Price'].describe()

count      5000.000000
mean     215817.751805
std       80632.119021
min       63841.875914
25%      163944.476304
50%      195602.879435
75%      250123.606760
max      576086.222746
Name: Price, dtype: float64

In [864]:
test_data.loc[:, ['Id', 'Price']].to_csv('ASidoryuk_predictions.csv', index=None)