# Стоимость дома

In [1]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

## Загрузим данные

In [2]:
# Read the whole dataset and hold out 20% of the rows for evaluation; the fixed seed keeps
# the split identical from run to run.
df = pd.read_excel("houses.xlsx")
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Report the split sizes. A single pre-formatted argument keeps this line valid, with the
# same output, on both Python 2 and Python 3 (a bare `print a, b` statement is a
# SyntaxError on Python 3).
print("%d %d" % (len(train), len(test)))

17290 4323


In [3]:
# Preview the first five rows of the training split (five is the default for head()).
train.head()

Unnamed: 0,price,bathrooms,sqft_living,waterfront,view,condition,grade,yr_built,zipcode,lat,long
6325,430000,1.5,1560,0,0,4,7,1962,98007,47.6012,-122.152
13473,86500,1.0,840,0,0,3,6,1960,98023,47.3277,-122.341
17614,287500,2.0,1760,0,0,4,6,1936,98055,47.4799,-122.232
16970,395000,1.0,1100,0,0,3,7,1950,98105,47.6701,-122.286
20868,295000,2.0,1760,0,0,3,7,1998,98030,47.3838,-122.184


## Подготовим признаки

In [4]:
# Separate each split into its target vector (the price) and its feature matrix
# (every remaining column).
target_column = "price"

train_target = train[target_column]
train_factors = train.drop(target_column, axis=1)

test_target = test[target_column]
test_factors = test.drop(target_column, axis=1)

In [5]:
# Standardize the features. The scaler is fitted on the training split only and then
# applied unchanged to both splits, so no test-set statistics influence the scaling.
scaler = StandardScaler().fit(train_factors)
train_factors = scaler.transform(train_factors)
test_factors = scaler.transform(test_factors)

In [6]:
# Look at the standardized training features, re-wrapped in a DataFrame so the columns
# are labelled. The labels are obtained by removing the target by name rather than by
# slicing off position 0, so they stay correct even if "price" is not the first column.
feature_names = train.columns.drop("price")
pd.DataFrame(train_factors, columns=feature_names).head(4)

Unnamed: 0,bathrooms,sqft_living,waterfront,view,condition,grade,yr_built,zipcode,lat,long
0,-0.795076,-0.566231,-0.086701,-0.308659,0.906493,-0.557661,-0.304035,-1.324888,0.288964,0.435537
1,-1.44252,-1.346468,-0.086701,-0.308659,-0.631609,-1.40538,-0.372139,-1.026161,-1.683103,-0.899824
2,-0.147632,-0.349498,-0.086701,-0.308659,0.906493,-1.40538,-1.18939,-0.428707,-0.585668,-0.129695
3,-1.44252,-1.064716,-0.086701,-0.308659,-0.631609,-0.557661,-0.71266,0.504814,0.785766,-0.511227


## Линейная модель

In [7]:
# Fit an ordinary least-squares regression on the training features.
# fit() returns the estimator itself, so the chained call leaves `model` fitted.
model = LinearRegression().fit(train_factors, train_target)
# Bare last expression so the notebook still displays the fitted estimator.
model



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# Evaluate the model on the held-out split using root-mean-squared error.
test_predictions = model.predict(test_factors)
rmse = np.sqrt(mean_squared_error(test_target, test_predictions))
# A single formatted argument is valid both as a Python 2 print statement and as a
# Python 3 print() call (the original `print 'RMSE:', x` is a SyntaxError on Python 3).
print('RMSE: %s' % rmse)

RMSE: 203114.027309


In [9]:
# Model weights, largest first, paired with the feature each one belongs to.
# Feature names are taken by dropping the target by name instead of slicing
# `train.columns[1:]`, which silently mislabels the weights whenever "price" is not
# the first column of the dataset.
feature_names = train.columns.drop("price")
pd.DataFrame(sorted(zip(model.coef_, feature_names), reverse=True), columns=["weight", "feature"])

Unnamed: 0,weight,feature
0,142926.888109,sqft_living
1,132216.174524,grade
2,82538.108268,lat
3,50558.832467,waterfront
4,40962.625858,view
5,23057.596589,bathrooms
6,12271.930114,condition
7,-25379.767021,long
8,-29444.769825,zipcode
9,-75398.103932,yr_built


## Одно дерево решений

In [10]:
# Fit a single decision tree (seeded for reproducibility) and score it on the held-out
# split with root-mean-squared error.
model = DecisionTreeRegressor(random_state=42)
model.fit(train_factors, train_target)
test_predictions = model.predict(test_factors)
rmse = np.sqrt(mean_squared_error(test_target, test_predictions))
# A single formatted argument is valid both as a Python 2 print statement and as a
# Python 3 print() call (the original `print 'RMSE:', x` is a SyntaxError on Python 3).
print('RMSE: %s' % rmse)

RMSE: 172668.946199


## Градиентный бустинг

In [11]:
# Fit a gradient-boosted ensemble and score it on the held-out split with
# root-mean-squared error. The estimator is seeded with the same value as the
# train/test split and the decision tree so the reported RMSE is reproducible; without
# a seed the result may vary from run to run.
model = GradientBoostingRegressor(random_state=42)
model.fit(train_factors, train_target)
test_predictions = model.predict(test_factors)
rmse = np.sqrt(mean_squared_error(test_target, test_predictions))
# A single formatted argument is valid both as a Python 2 print statement and as a
# Python 3 print() call (the original `print 'RMSE:', x` is a SyntaxError on Python 3).
print('RMSE: %s' % rmse)

RMSE: 130771.225761
