# Задание:
Используя данные из обучающего датасета (train.csv), построить модель для предсказания цен на недвижимость (квартиры).
С помощью полученной модели, предсказать цены для квартир из тестового датасета (test.csv).

### Целевая переменная:
Price

### Метрика качества:
R2 - коэффициент детерминации (sklearn.metrics.r2_score)

### Требования к решению:
1. R2 > 0.6
2. Тетрадка Jupyter Notebook с кодом Вашего решения, названная по образцу {ФИО}_solution.ipynb, пример SShirkin_solution.ipynb
3. Файл CSV с прогнозами целевой переменной для тестового датасета, названный по образцу {ФИО}_predictions.csv, пример SShirkin_predictions.csv 
Файл должен содержать два поля: Id, Price и в файле должна быть 5001 строка (шапка + 5000 предсказаний).

### Библиотеки:

In [249]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

### Загрузка и обработка данных:

In [265]:
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')
train_dataset.drop(['Healthcare_1'], axis = 'columns', inplace=True)
train_dataset.replace({'Ecology_2' : {'A' : 0, 'B' : 1}, 'Ecology_3' : {'A' : 0, 'B' : 1}, 'Shops_2' : {'A' : 0, 'B' : 1}}, inplace=True)
dic_med_price = train_dataset.groupby('DistrictId')[['Price']].median().reset_index()
ec_med_price = train_dataset.groupby('Ecology_1')[['Price']].median().reset_index()
train_dataset = pd.merge(train_dataset, dic_med_price, how ='left', on ='DistrictId')
train_dataset = pd.merge(train_dataset, ec_med_price, how ='left', on ='Ecology_1')
train_dataset.drop(['DistrictId', 'Ecology_1'], axis = 'columns', inplace=True)
target = train_dataset.Price_x
train_dataset['target'] = target
train_dataset.drop(['Price_x', 'Ecology_2', 'Ecology_3', 'Helthcare_2', 'Shops_2'], axis = 'columns', inplace=True)
train_dataset.head(10)


Unnamed: 0,Id,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Social_1,Social_2,Social_3,Shops_1,Price_y,Price,target
0,14038,2.0,47.981561,29.442751,6.0,7,9.0,1969,33,7976,5,11,203602.408898,216329.642379,184966.93073
1,15053,3.0,65.68364,40.049543,8.0,7,9.0,1978,46,10309,1,16,210694.850106,205321.620544,300009.450063
2,4765,2.0,44.947953,29.197612,0.0,8,12.0,1968,34,7759,0,3,245978.794474,253080.425473,220925.908524
3,5809,2.0,53.352981,52.731512,9.0,8,17.0,1977,23,5735,3,5,151557.904767,151557.904767,175616.227217
4,10783,1.0,39.649192,23.776169,7.0,11,12.0,1976,35,5776,1,4,178829.16645,187118.870974,150226.531644
5,12915,3.0,80.384479,46.68372,12.0,5,17.0,2011,35,7715,4,6,318772.37474,187656.177472,215898.447742
6,14549,2.0,62.254114,37.160377,7.0,3,5.0,1960,20,4386,14,5,279053.933157,244852.013738,296021.204377
7,11993,2.0,80.312926,,0.0,14,0.0,1977,6,1437,3,2,195610.960042,159768.174822,221244.156664
8,5172,2.0,64.511437,,1.0,9,17.0,1977,1,264,0,1,183663.443595,183882.806975,229102.795999
9,8649,1.0,46.461409,18.915552,8.0,13,17.0,2014,6,1437,3,2,113789.561738,159768.174822,95380.220993


### Обработака нулевых значений:

In [266]:
print(train_dataset.isnull().sum())
X = train_dataset.iloc[:, 1 :-1].values
y = train_dataset.iloc[:, -1].values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X)
X = imputer.transform(X)
X

Id                  0
Rooms               0
Square              0
LifeSquare       2113
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Social_1            0
Social_2            0
Social_3            0
Shops_1             0
Price_y             0
Price               0
target              0
dtype: int64


array([[2.00000000e+00, 4.79815612e+01, 2.94427505e+01, ...,
        1.10000000e+01, 2.03602409e+05, 2.16329642e+05],
       [3.00000000e+00, 6.56836399e+01, 4.00495425e+01, ...,
        1.60000000e+01, 2.10694850e+05, 2.05321621e+05],
       [2.00000000e+00, 4.49479528e+01, 2.91976117e+01, ...,
        3.00000000e+00, 2.45978794e+05, 2.53080425e+05],
       ...,
       [1.00000000e+00, 4.79390077e+01, 3.71996448e+01, ...,
        0.00000000e+00, 1.46171433e+05, 1.65691439e+05],
       [2.00000000e+00, 4.36025615e+01, 3.38401475e+01, ...,
        5.00000000e+00, 1.94932011e+05, 1.91289970e+05],
       [1.00000000e+00, 3.86666450e+01, 2.11578744e+01, ...,
        8.00000000e+00, 2.39810184e+05, 2.39973726e+05]])

### Разбиение данных на тренировочную и тестовую часть. Стандартизация данных.

In [267]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
sc = StandardScaler()
X_train[:, :] = sc.fit_transform(X_train[:, :])
X_test[:, :] = sc.transform(X_test[:, :])

### Тренировка модели.

In [268]:
model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [269]:
new_predict = model.predict(X_test)
print(r2_score(y_test, new_predict))

0.7420422910728484


### Проверка модели на тестовых данных

In [270]:
test_dataset = pd.read_csv('test.csv')
test_dataset.drop(['Healthcare_1'], axis = 'columns', inplace=True)
test_dataset.replace({'Ecology_2' : {'A' : 0, 'B' : 1}, 'Ecology_3' : {'A' : 0, 'B' : 1}, 'Shops_2' : {'A' : 0, 'B' : 1}}, inplace=True)
test_dataset = pd.merge(test_dataset, dic_med_price, how ='left', on ='DistrictId')
test_dataset = pd.merge(test_dataset, ec_med_price, how ='left', on ='Ecology_1')
test_dataset.drop(['DistrictId', 'Ecology_1'], axis = 'columns', inplace=True)
test_dataset.drop([ 'Ecology_2', 'Ecology_3', 'Helthcare_2', 'Shops_2'], axis = 'columns', inplace=True)
test_dataset.rename(columns={'Price_x': 'Price_y', 'Price_y': 'Price'}, inplace=True)
test_dataset.head(10)

Unnamed: 0,Id,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Social_1,Social_2,Social_3,Shops_1,Price_y,Price
0,725,2.0,49.882643,33.432782,6.0,6,14.0,1972,11,2748,1,0,151557.904767,155335.327454
1,15856,2.0,69.263183,,1.0,6,1.0,1977,6,1437,3,2,195610.960042,159768.174822
2,5480,1.0,13.597819,15.948246,12.0,2,5.0,1909,30,7538,87,5,526438.458919,239973.726358
3,15664,2.0,73.046609,51.940842,9.0,22,22.0,2007,23,4583,3,3,196429.659238,221914.062737
4,14275,1.0,47.527111,43.387569,1.0,17,17.0,2017,2,629,1,0,146171.43319,165691.43911
5,7633,1.0,40.675627,,1.0,21,21.0,1977,34,7759,0,3,245978.794474,253080.425473
6,13329,2.0,68.099538,64.843025,1.0,2,17.0,1977,6,1437,3,2,113789.561738,159768.174822
7,5502,2.0,48.193897,32.857644,6.0,5,14.0,1972,46,7960,6,11,234647.811956,239331.050596
8,4220,3.0,72.277106,45.968758,9.0,17,17.0,1997,53,14892,4,4,200440.676982,183347.994462
9,11538,3.0,80.2194,47.66026,9.0,13,17.0,2014,5,1564,0,0,168980.370846,168980.370846


In [271]:
print(test_dataset.isnull().sum())
X_test = test_dataset.iloc[:,1 :].values
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(X_test)
X_test = imputer.transform(X_test)
X_test

Id                  0
Rooms               0
Square              0
LifeSquare       1041
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Social_1            0
Social_2            0
Social_3            0
Shops_1             0
Price_y             8
Price               5
dtype: int64


array([[2.00000000e+00, 4.98826434e+01, 3.34327817e+01, ...,
        0.00000000e+00, 1.51557905e+05, 1.55335327e+05],
       [2.00000000e+00, 6.92631835e+01, 3.29250866e+01, ...,
        2.00000000e+00, 1.95610960e+05, 1.59768175e+05],
       [1.00000000e+00, 1.35978185e+01, 1.59482464e+01, ...,
        5.00000000e+00, 5.26438459e+05, 2.39973726e+05],
       ...,
       [3.00000000e+00, 7.78421779e+01, 4.82826251e+01, ...,
        1.50000000e+01, 2.01169579e+05, 2.28453854e+05],
       [2.00000000e+00, 8.13052220e+01, 3.29250866e+01, ...,
        0.00000000e+00, 1.62067275e+05, 1.65691439e+05],
       [2.00000000e+00, 6.05556927e+01, 3.29250866e+01, ...,
        2.30000000e+01, 1.64259100e+05, 1.67082631e+05]])

In [272]:
sc = StandardScaler()
X_test[:, :] = sc.fit_transform(X_test[:, :])
test_predict = model.predict(X_test)
print(len(test_predict))

5000


In [273]:
test_dataset['Price'] = test_predict
test_dataset[['Id', 'Price']].to_csv('predict_2.csv', index=False)
test_dataset.head(10)

Unnamed: 0,Id,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Social_1,Social_2,Social_3,Shops_1,Price_y,Price
0,725,2.0,49.882643,33.432782,6.0,6,14.0,1972,11,2748,1,0,151557.904767,154141.518384
1,15856,2.0,69.263183,,1.0,6,1.0,1977,6,1437,3,2,195610.960042,232752.938511
2,5480,1.0,13.597819,15.948246,12.0,2,5.0,1909,30,7538,87,5,526438.458919,363802.545014
3,15664,2.0,73.046609,51.940842,9.0,22,22.0,2007,23,4583,3,3,196429.659238,333676.567908
4,14275,1.0,47.527111,43.387569,1.0,17,17.0,2017,2,629,1,0,146171.43319,144665.446989
5,7633,1.0,40.675627,,1.0,21,21.0,1977,34,7759,0,3,245978.794474,206490.078567
6,13329,2.0,68.099538,64.843025,1.0,2,17.0,1977,6,1437,3,2,113789.561738,174202.441666
7,5502,2.0,48.193897,32.857644,6.0,5,14.0,1972,46,7960,6,11,234647.811956,205335.877626
8,4220,3.0,72.277106,45.968758,9.0,17,17.0,1997,53,14892,4,4,200440.676982,286008.686248
9,11538,3.0,80.2194,47.66026,9.0,13,17.0,2014,5,1564,0,0,168980.370846,218429.163456
