# Data loading

In [None]:
import sqlite3
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.options.display.float_format ='{:,.2f}'.format

In [3]:
con = sqlite3.connect('old_db.db')

In [4]:
df = pd.read_sql('SELECT f.id_flat, f.room, f.square, f.lat, f.lon, f.description, f.price FROM flats f;', con)

In [5]:
pattern = '(\d{4}) г'
df['year'] = df.description.str.extract(pattern, expand=False)

In [6]:
df['year'] = df['year'].astype('int64')

In [7]:
def home_type(x):
    if 'кирпичный' in x:
        return 'кирпичный дом'
    if 'монолитный' in x:
        return 'монолитный дом'
    if 'панельный' in x:
        return 'панельный дом'
    return 'другой'

In [8]:
df['home_type'] = df['description'].apply(home_type)

In [9]:
df = df.drop(['description'], axis=1)

In [10]:
df.head()

Unnamed: 0,id_flat,room,square,lat,lon,price,year,home_type
0,686141223,2,52.9,43.24,76.89,41000000,1980,панельный дом
1,684937218,1,57.9,43.17,76.9,29580000,2022,монолитный дом
2,686513752,2,65.9,43.32,77.02,28900000,2023,монолитный дом
3,685684495,1,29.0,43.21,76.89,25000000,2023,монолитный дом
4,686240870,3,78.0,43.29,76.94,52000000,2014,монолитный дом


In [11]:
y = df['price']
X = df.drop('price', axis=1)

# Training and test sample

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Transformation Pipelines

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])


In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, ['room', 'square', 'lat', 'lon', 'year']),
        ("cat", OneHotEncoder(), ['home_type']),
    ])

housing_prepared = full_pipeline.fit_transform(X_train)

In [15]:
housing_prepared[0]

array([-1.19321874, -1.00441512, -0.02840532, -0.06518303,  0.92077622,
        0.        ,  0.        ,  1.        ,  0.        ])

In [16]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression

def error(y_true,y_pred):
    print(mean_absolute_error(y_true,y_pred))
    print(mean_absolute_percentage_error(y_true,y_pred))

lin_reg = LinearRegression()    
lin_reg.fit(housing_prepared, y_train)

housing_predictions = lin_reg.predict(housing_prepared)
error(y_train, housing_predictions)


10973520.149264205
0.2606096600094067


In [17]:
housing_test = full_pipeline.fit_transform(X_test)
error(y_test, lin_reg.predict(housing_test))

11162438.3909363
0.26077974913803825


# Better Evaluation Using Cross-Validation

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer


scoring = make_scorer(mean_absolute_error)

tree_reg = DecisionTreeRegressor(random_state=42)

scores = cross_val_score(tree_reg, housing_prepared, y_train,
                         scoring=scoring, cv=10)
scores.mean()

7804598.049250238

In [19]:
reg = LinearRegression()
reg_score = cross_val_score(reg, housing_prepared, y_train, scoring=scoring, cv=10)
reg_score.mean()

10992983.706963968

In [20]:
housing_test = full_pipeline.fit_transform(X_test)

In [21]:
housing_test_pred = lin_reg.predict(housing_test)

In [22]:
error(y_test, housing_test_pred)

11162438.3909363
0.26077974913803825


In [23]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, y_train)

In [24]:
housing_predictions = forest_reg.predict(housing_prepared)

In [25]:
error(y_train, housing_predictions)

2199929.5106925843
0.04731157739110398


In [26]:
housing_test_pred = forest_reg.predict(housing_test)

In [27]:
error(y_test, housing_test_pred)

9465530.51489916
0.20800521530727067


In [28]:
scores = cross_val_score(forest_reg, housing_prepared, y_train,
                         scoring=scoring, cv=10)
scores.mean()

5920113.533503781

In [29]:
5769482 / y_train.mean()

0.12722435711066493

# Grid Search

In [30]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring=scoring,
                           return_train_score=True)
grid_search.fit(housing_prepared, y_train)

In [60]:
grid_search.best_params_

{'max_features': 2, 'n_estimators': 3}

In [61]:
grid_search.best_estimator_

In [64]:
import numpy as np

cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

6948501.870185805 {'max_features': 2, 'n_estimators': 3}
6143956.717890289 {'max_features': 2, 'n_estimators': 10}
5828166.01318497 {'max_features': 2, 'n_estimators': 30}
6696000.173407483 {'max_features': 4, 'n_estimators': 3}
6013832.012545468 {'max_features': 4, 'n_estimators': 10}
5778235.324723291 {'max_features': 4, 'n_estimators': 30}
6592281.409838788 {'max_features': 6, 'n_estimators': 3}
6007185.077151522 {'max_features': 6, 'n_estimators': 10}
5848790.069690371 {'max_features': 6, 'n_estimators': 30}
6666172.7562467335 {'max_features': 8, 'n_estimators': 3}
6015712.701244492 {'max_features': 8, 'n_estimators': 10}
5880493.212550426 {'max_features': 8, 'n_estimators': 30}
6753024.547185177 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
6113301.901550042 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
6677663.808705701 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
6070209.046479545 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10

In [65]:
housing_prepared.shape

(11908, 9)