In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as MSE, r2_score as r2
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

<H1>Работа с обучающим датасетом</H1>

In [3]:
df=pd.read_csv('train.csv')

In [4]:
data=df.copy()

<h2>Предобработка данных</h2>

In [5]:
data = data.drop(['Healthcare_1'], axis = 1)

In [6]:
data = pd.get_dummies(data)

<H3> Год строительства </H3>

Исправление ошибок в данных

In [7]:
data.loc[data['HouseYear'] > 2021]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
1497,10814,109,1.0,37.26507,20.239714,9.0,9,12.0,20052011,0.13633,...,10,3,6,254084.534396,0,1,0,1,0,1
4189,11607,147,2.0,44.791836,28.360393,5.0,4,9.0,4968,0.319809,...,16,5,8,243028.603096,0,1,0,1,0,1


In [8]:
data.loc[data.HouseYear == 20052011,'HouseYear'] = 2011

In [9]:
data.loc[data.HouseYear == 4968,'HouseYear'] = 1968

Новый признак: время застройки района

In [10]:
dist_mean_year = data.groupby(['DistrictId'])[['HouseYear']].mean().astype('int32').reset_index().rename(columns = {'HouseYear': 'Dist_mean_year'})
data = data.merge(dist_mean_year, on = ('DistrictId'), how ='left')

Новый признак: новостройка

In [11]:
data['New_House'] = (data['HouseYear'] >= 2018).astype('int32')

<H3> Площадь </H3>

Исправление ошибок в данных

In [12]:
data.loc[data.Rooms>5]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
377,5927,57,10.0,59.056975,36.223072,10.0,22,22.0,2002,0.090799,...,15,317265.323792,0,1,0,1,0,1,1989,0
1454,8491,1,19.0,42.006046,21.779288,7.0,17,17.0,2014,0.007122,...,1,78364.616704,0,1,0,1,0,1,1985,0
2170,14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,...,8,229661.964416,0,1,0,1,0,1,1976,0
8849,14865,9,10.0,60.871266,38.420681,10.0,3,2.0,1994,0.161532,...,4,172329.270863,0,1,0,1,0,1,1988,0


In [13]:
data.loc[1454,'Rooms'] = 1

In [14]:
data.loc[[377,2170,8849],'Rooms'] = 2

In [15]:
data.loc[data.Rooms == 0]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
1397,12638,27,0.0,138.427694,136.215499,0.0,4,3.0,2016,0.075424,...,0,268394.744389,0,1,0,1,0,1,1994,0
1981,7917,27,0.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,...,1,302211.260887,0,1,0,1,0,1,1994,0
2269,7317,27,0.0,41.790881,,0.0,13,0.0,1977,0.211401,...,1,98129.976788,0,1,0,1,0,1,1994,0
3911,770,28,0.0,49.483501,,0.0,16,0.0,2015,0.118537,...,0,217009.338463,0,1,0,1,0,1,1981,0
4366,456,6,0.0,81.491446,,0.0,4,0.0,1977,0.243205,...,0,212864.799112,0,1,0,1,0,1,1990,0
4853,3224,27,0.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,...,0,126596.941798,0,1,0,1,0,1,1994,0
6149,3159,88,0.0,38.697117,19.345131,9.0,9,16.0,1982,0.127376,...,9,158998.110646,0,1,0,1,0,1,1982,0
8834,9443,27,0.0,87.762616,85.125471,0.0,5,15.0,1977,0.211401,...,1,219281.918007,0,1,0,1,0,1,1994,0


In [16]:
data.loc[[1397,1981],'Rooms'] = 4
data.loc[[8834],'Rooms'] = 3
data.loc[[4366],'Rooms'] = 2
data.loc[[2269, 3911, 6149],'Rooms'] = 1

In [17]:
data.loc[data.Rooms == 0]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
4853,3224,27,0.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,...,0,126596.941798,0,1,0,1,0,1,1994,0


In [18]:
data.drop(4853, inplace=True)

In [19]:
data.loc[data.LifeSquare>200]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
590,14990,23,2.0,48.449873,263.54202,5.0,6,5.0,1972,0.075779,...,2,141780.231857,0,1,0,1,0,1,1987,0
1981,7917,27,4.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,...,1,302211.260887,0,1,0,1,0,1,1994,0
1982,5548,86,5.0,275.645284,233.949309,26.0,12,37.0,2011,0.161976,...,7,455264.882666,0,1,1,0,0,1,1975,0
4328,16550,27,3.0,81.694417,7480.592129,1.0,9,17.0,2016,0.017647,...,0,217357.492366,0,1,0,1,0,1,1994,0
4690,2307,102,1.0,409.425181,410.639749,10.0,4,4.0,2016,0.238617,...,6,90470.43083,0,1,0,1,0,1,1985,0
6977,11602,30,2.0,641.065193,638.163193,10.0,20,19.0,2019,7.8e-05,...,23,133529.681562,0,1,0,1,0,1,1992,1
8437,15886,85,3.0,78.059331,461.463614,10.0,12,16.0,1998,0.037178,...,7,394253.299978,0,1,0,1,0,1,1980,0
9910,16568,27,4.0,200.334539,201.627361,25.0,1,2.0,2013,0.041116,...,4,528560.506016,0,1,0,1,0,1,1994,0


In [20]:
data.drop(data[data.LifeSquare > 300].index, inplace=True)

In [21]:
data.loc[(data.LifeSquare - data.Square > 5)] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
212,1748,88,2.0,5.497061,67.628717,1.0,24,22.0,1977,0.127376,...,9,412511.088764,0,1,0,1,0,1,1982,0
590,14990,23,2.0,48.449873,263.54202,5.0,6,5.0,1972,0.075779,...,2,141780.231857,0,1,0,1,0,1,1987,0
3280,10527,27,1.0,4.380726,40.805837,1.0,10,17.0,2013,0.211401,...,1,97560.720383,0,1,0,1,0,1,1994,0
4638,4071,6,3.0,75.203314,82.486992,1.0,13,25.0,2015,0.243205,...,0,211324.917957,0,1,0,1,0,1,1990,0
6332,8961,27,1.0,33.398983,164.15336,6.0,3,5.0,1965,0.211401,...,1,104891.073757,0,1,0,1,0,1,1994,0
8030,13265,1,3.0,4.823679,79.767964,0.0,6,17.0,1977,0.007122,...,1,237716.681261,0,1,0,1,0,1,1985,0


In [22]:
data.loc[[590],'LifeSquare']=26.354
data.loc[[3280],'Square']=43.807
data.loc[[6332],'LifeSquare']=16.415

In [23]:
data.drop(data[data.LifeSquare - data.Square > 5].index, inplace=True)

In [24]:
data.loc[(data.Square<15)|(data.Square>300)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
1316,11526,27,1.0,4.633498,1.969969,1.0,18,1.0,1977,0.011654,...,0,107604.269441,0,1,0,1,0,1,1994,0
1608,10202,6,1.0,2.596351,4.604943,1.0,3,25.0,2014,0.243205,...,0,137597.601458,0,1,0,1,0,1,1990,0
3413,9487,5,1.0,5.129222,5.549458,1.0,1,1.0,1977,0.150818,...,5,369472.403061,0,1,0,1,0,1,1979,0
4262,28,9,2.0,604.705972,,1.0,17,18.0,1977,0.161532,...,4,187717.242538,0,1,0,1,0,1,1988,0
4739,12676,81,3.0,13.784865,15.988889,7.0,4,5.0,1960,0.319809,...,8,78388.806186,0,1,0,1,0,1,1972,0
4900,4504,27,3.0,4.390331,5.610772,1.0,8,19.0,2016,0.211401,...,1,161379.067034,0,1,0,1,0,1,1994,0
6392,14786,1,1.0,1.136859,4.525736,1.0,3,1.0,1977,0.007122,...,1,181434.825589,0,1,0,1,0,1,1985,0
8283,15744,34,1.0,1.988943,2.642219,1.0,21,4.0,1977,0.069753,...,11,458378.777006,0,1,0,1,0,1,2002,0
9294,6782,45,1.0,2.954309,5.257278,1.0,3,1.0,1977,0.195781,...,2,438005.182323,0,1,0,1,0,1,1988,0
9613,7657,38,1.0,4.502784,4.618547,1.0,28,1.0,1977,0.060753,...,7,483283.488083,0,1,0,1,0,1,2000,0


In [25]:
data.loc[[1316],'Square']=46.334
data.loc[[1608],'Square']=25.963
data.loc[[4262],'Square']=60.705
data.loc[[4900],'Square']=43.903

data.drop(3414, inplace=True)
data.drop(6392, inplace=True)
data.drop(8283, inplace=True)
data.drop(9294, inplace=True)
data.drop(9613, inplace=True)

In [26]:
data.loc[(data.Square<15)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
3413,9487,5,1.0,5.129222,5.549458,1.0,1,1.0,1977,0.150818,...,5,369472.403061,0,1,0,1,0,1,1979,0
4739,12676,81,3.0,13.784865,15.988889,7.0,4,5.0,1960,0.319809,...,8,78388.806186,0,1,0,1,0,1,1972,0


In [27]:
data.drop(3413, inplace=True)
data.drop(4739, inplace=True)

In [28]:
# square_lifesquare=(data.Square/data.LifeSquare).mean()
# square_lifesquare

In [29]:
# def fillna_lifesquare(df, df_source):
#     df['LifeSquare']=df['LifeSquare'].fillna(df_source['Square']/square_lifesquare)
#     return df

<H3>Этажность</H3>

Исправление ошибок

In [30]:
data.loc[data.Floor>data.HouseFloor,'HouseFloor']=data.loc[data.Floor>data.HouseFloor,'Floor']

Новый признак: первый и последний этаж

In [31]:
data['frs_lst_floor'] = (data['Floor'] == data['HouseFloor']) | (data['Floor'] <= 1)
data['frs_lst_floor'] = data['frs_lst_floor'].astype('int')

<H2>Обучение </H2>

Определение факторов для построения модели

In [32]:
data.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Price', 'Ecology_2_A',
       'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A', 'Shops_2_B',
       'Dist_mean_year', 'New_House', 'frs_lst_floor'],
      dtype='object')

In [33]:
features = [
#     'Id', 
    'DistrictId', 
#     'Rooms', 
    'Square', 
#     'LifeSquare', 
#     'KitchenSquare',
    'Floor', 
    'HouseFloor', 
    'HouseYear', 
    'Ecology_1', 
    'Social_1', 
    'Social_2',
    'Social_3', 
    'Helthcare_2', 
    'Shops_1', 
#     'Price', 
    'Ecology_2_A',
    'Ecology_2_B',
    'Ecology_3_A', 
    'Ecology_3_B', 
    'Shops_2_A', 
    'Shops_2_B',
    'Dist_mean_year', 
    'New_House', 
    'frs_lst_floor']

Формирование наборов данных

In [34]:
X = data[features]

In [35]:
y = data['Price']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Нормализация данных

In [37]:
scaler = StandardScaler()

In [38]:
X_train_scaled = scaler.fit_transform(X_train)

In [39]:
X_test_scaled = scaler.transform(X_test)

In [40]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = features)

In [41]:
X_test_scaled = pd.DataFrame(X_test_scaled, columns = features)

Обучение модели

In [42]:
model = RF(n_estimators = 2000,  max_depth = 8, random_state = 42, n_jobs = 8)

In [43]:
model.fit(X_train_scaled, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=8,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [44]:
pred_train = model.predict(X_train_scaled)

In [45]:
r2(y_train, pred_train)

0.79455403349488

Проверка

In [46]:
pred_test = model.predict(X_test_scaled)

In [47]:
r2(y_test, pred_test)

0.7253076007903596

Подбор параметров модели

In [48]:
score_train = 0
score_test = 0
best_dep = 0
for i in range(100, 2000, 100):
        model = RF(n_estimators = i,  max_depth = 15, random_state = 42, n_jobs = 8)
        model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        r2_train = r2(y_train, pred_train)
        pred_test = model.predict(X_test)
        r2_test = r2(y_test, pred_test)
        if r2_test > score_test:
                score_test = r2_test
                score_train = r2_train
                best_dep = i
print(score_test, best_dep, score_train)

0.7495526271439328 1600 0.9419684107945676


In [49]:
model = RF(n_estimators = 1600,  max_depth = 15, random_state = 42, n_jobs = 8)

In [50]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1600, n_jobs=8,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [51]:
pred_train = model.predict(X_train)

In [52]:
r2_train = r2(y_train, pred_train)

In [53]:
pred_test = model.predict(X_test)

In [54]:
r2_test = r2(y_test, pred_test)

In [55]:
print(r2_test)

0.7495526271439328


<H1>Работа с тестовым датасетом</H1>

In [56]:
ts = pd.read_csv('test.csv')

In [57]:
test = ts.copy()

<h2>Предобработка данных</h2>

In [58]:
test = test.drop(['Healthcare_1'], axis = 1)

In [59]:
test = pd.get_dummies(test)

Исправление ошибок в данных

In [60]:
test.loc[test['HouseYear']].describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_2,Social_3,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
count,5000.0,5000.0,5000.0,5000.0,4311.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,10284.8736,59.3186,1.6364,52.489499,32.196843,14.7106,7.3788,12.319,1985.5664,0.210495,...,5631.6666,6.3086,1.2984,4.472,0.0176,0.9824,0.0334,0.9666,0.068,0.932
std,4959.119702,40.749621,0.72519,20.619943,16.425226,70.647652,4.579859,5.345667,17.860442,0.171325,...,3216.781265,18.299069,1.637282,3.813781,0.131506,0.131506,0.179697,0.179697,0.251771,0.251771
min,164.0,0.0,1.0,29.708662,2.712936,0.0,1.0,0.0,1953.0,0.0,...,168.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5439.0,27.0,1.0,35.124873,21.465834,5.0,4.0,9.0,1976.0,0.041125,...,3893.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
50%,11253.0,58.0,1.0,45.596795,27.662295,6.0,6.0,12.0,1981.0,0.174739,...,5735.0,3.0,0.5,5.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,14863.0,74.0,2.0,61.646444,38.905038,9.0,10.0,16.0,2002.0,0.437885,...,6564.0,4.0,3.0,5.0,0.0,1.0,0.0,1.0,0.0,1.0
max,16792.0,184.0,4.0,125.538178,126.473035,620.0,26.0,26.0,2019.0,0.437885,...,19083.0,141.0,6.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0


In [61]:
test['HouseYear'].describe()

count    5000.000000
mean     1984.392600
std        18.573149
min      1908.000000
25%      1973.000000
50%      1977.000000
75%      2000.000000
max      2020.000000
Name: HouseYear, dtype: float64

Новый признак: время застройки района

<H3> Год строительства </H3>

In [62]:
dist_mean_year = test.groupby(['DistrictId'])[['HouseYear']].mean().astype('int32').reset_index().rename(columns = {'HouseYear': 'Dist_mean_year'})
test = test.merge(dist_mean_year, on = ('DistrictId'), how ='left')

Новый признак: новостройка

In [63]:
test['New_House'] = (test['HouseYear'] >= 2018).astype('int32')

<H3> Площадь </H3>

Исправление ошибок в данных

In [64]:
test.loc[data.Rooms].describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
count,9985.0,9985.0,9985.0,9985.0,6264.0,9985.0,9985.0,9985.0,9985.0,9985.0,...,9985.0,9985.0,9985.0,9985.0,9985.0,9985.0,9985.0,9985.0,9985.0,9985.0
mean,11740.214522,112.315473,1.594191,48.083978,29.427535,7.063295,8.212719,7.524787,1957.871708,0.052043,...,2.615724,3.361042,0.0,1.0,0.0,1.0,0.015223,0.984777,1967.065198,0.0
std,5008.785501,62.936643,0.491072,27.714198,17.257342,4.95733,7.794345,8.170877,40.710758,0.042695,...,2.211307,1.390265,0.0,0.0,0.0,0.0,0.122444,0.122444,23.995631,0.0
min,5480.0,27.0,1.0,13.597819,15.948246,1.0,2.0,1.0,1909.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1937.0,0.0
25%,5480.0,74.0,1.0,13.597819,15.948246,1.0,2.0,1.0,1909.0,0.0,...,0.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1937.0,0.0
50%,15664.0,74.0,2.0,69.263183,15.948246,9.0,6.0,5.0,1977.0,0.075779,...,3.0,3.0,0.0,1.0,0.0,1.0,0.0,1.0,1986.0,0.0
75%,15856.0,190.0,2.0,69.263183,51.940842,12.0,6.0,5.0,1977.0,0.075779,...,5.0,5.0,0.0,1.0,0.0,1.0,0.0,1.0,1986.0,0.0
max,15856.0,190.0,2.0,73.046609,51.940842,12.0,22.0,22.0,2017.0,0.101872,...,5.0,5.0,0.0,1.0,0.0,1.0,1.0,1.0,1993.0,0.0


In [65]:
test.loc[test.LifeSquare>200]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
410,11533,94,2.0,48.713443,303.071094,6.0,5,12.0,1974,0.521867,...,0,0,0,1,0,1,0,1,1982,0


In [66]:
test.loc[(test.LifeSquare - test.Square > 5)] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
410,11533,94,2.0,48.713443,303.071094,6.0,5,12.0,1974,0.521867,...,0,0,0,1,0,1,0,1,1982,0
4713,170,6,2.0,2.900586,61.468563,1.0,18,17.0,2014,0.243205,...,0,0,0,1,0,1,0,1,1991,0


In [67]:
test.loc[[410],'LifeSquare']=30.307
test.loc[[4713],'Square']=61.468

In [68]:
test.loc[(test.Square<13)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Dist_mean_year,New_House
66,9011,53,1.0,1.378543,1.353573,1.0,1,1.0,1977,0.049637,...,1,3,0,1,0,1,0,1,1991,0
608,16401,30,1.0,2.645046,4.338755,1.0,2,1.0,1977,7.8e-05,...,3,23,0,1,0,1,0,1,1990,0
837,2138,27,1.0,5.647458,1.501582,1.0,1,1.0,1977,0.017647,...,0,0,0,1,0,1,0,1,1993,0
1165,10120,6,1.0,5.100672,3.86178,1.0,3,1.0,1977,0.243205,...,0,0,0,1,0,1,0,1,1991,0
4490,1165,27,1.0,2.372101,1.899119,1.0,2,17.0,1977,0.011654,...,0,0,0,1,0,1,0,1,1993,0
4540,7855,6,1.0,4.967143,2.968086,1.0,3,1.0,2018,0.243205,...,0,0,0,1,0,1,0,1,1991,1
4902,1420,45,1.0,1.975769,2.900371,1.0,1,1.0,1977,0.195781,...,3,2,0,1,0,1,0,1,1992,0


In [69]:
test.loc[[66],'Square'] = 13.785
test.loc[[608],'Square'] = 26.450
test.loc[[4490],'Square'] = 23.721
test.loc[[4490],'Square'] = 49.671
test.loc[[4902],'Square'] = 19.757

In [70]:
test.loc[(test.Square < 13), 'Square'] = 13

<H3>Этажность</H3>

Исправление ошибок

In [71]:
test.loc[test.Floor > test.HouseFloor,'HouseFloor'] = test.loc[test.Floor > test.HouseFloor,'Floor']

Новый признак: первый и последний этаж

In [72]:
test['frs_lst_floor'] = (test['Floor'] == test['HouseFloor']) | (test['Floor'] <= 1)
test['frs_lst_floor'] = test['frs_lst_floor'].astype('int')

Проверка

In [73]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 24 columns):
Id                5000 non-null int64
DistrictId        5000 non-null int64
Rooms             5000 non-null float64
Square            5000 non-null float64
LifeSquare        3959 non-null float64
KitchenSquare     5000 non-null float64
Floor             5000 non-null int64
HouseFloor        5000 non-null float64
HouseYear         5000 non-null int64
Ecology_1         5000 non-null float64
Social_1          5000 non-null int64
Social_2          5000 non-null int64
Social_3          5000 non-null int64
Helthcare_2       5000 non-null int64
Shops_1           5000 non-null int64
Ecology_2_A       5000 non-null uint8
Ecology_2_B       5000 non-null uint8
Ecology_3_A       5000 non-null uint8
Ecology_3_B       5000 non-null uint8
Shops_2_A         5000 non-null uint8
Shops_2_B         5000 non-null uint8
Dist_mean_year    5000 non-null int32
New_House         5000 non-null int32
frs_lst

<H2>Предсказание цены</H2>

In [74]:
price_pred = model.predict(test[features])

In [75]:
test['Price'] = price_pred

In [76]:
test[['Id', 'Price']].to_csv('YChizhevskiy_predictions.csv', index = None)