In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
# Load the cleaned flats dataset from the working directory and preview
# the first five rows.
df = pd.read_csv('flats_cleaned.csv')
df.head()

Unnamed: 0,price,district,rooms,floor,metro,total_area,kitchen_area,floor_total
0,26000,Солом'янський,2,14.0,1,60.0,15.0,25.0
1,95750,Печерський,3,10.0,1,131.0,15.0,25.0
2,57450,Печерський,2,2.0,1,87.0,17.0,23.0
3,8000,Дарницький,1,3.0,1,40.0,9.0,16.0
4,15000,Дарницький,1,15.0,1,38.0,16.0,25.0


In [27]:
# Column dtypes and non-null counts; used here to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14946 entries, 0 to 14945
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         14946 non-null  int64  
 1   district      14946 non-null  object 
 2   rooms         14946 non-null  int64  
 3   floor         14946 non-null  float64
 4   metro         14946 non-null  int64  
 5   total_area    14946 non-null  float64
 6   kitchen_area  14946 non-null  float64
 7   floor_total   14913 non-null  float64
dtypes: float64(4), int64(3), object(1)
memory usage: 934.3+ KB


## Train test split

In [28]:
from sklearn.model_selection import train_test_split

# Separate the features from the target ('price'), then hold out 20% of the
# rows for testing. The fixed seed keeps the split reproducible.
X = df.drop(columns=['price'])
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

## Pipeline

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.compose import ColumnTransformer
# Categorical branch: dense one-hot encoding; categories unseen during fit
# are ignored instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Numeric branch: fill missing values from the 3 nearest neighbours, then
# standardise.
imputer = KNNImputer(n_neighbors=3)
scaler = StandardScaler()
pipe_num = Pipeline(
    steps=[
        ('imputer', imputer),
        ('scaler', scaler),
    ]
)

In [30]:
# Columns are routed by position in X (the frame with 'price' dropped):
# positions 1..6 go through the numeric pipeline, position 0 is one-hot encoded.
col_transformer = ColumnTransformer(
    transformers=[
        ('num_preproc', pipe_num, np.arange(1, 7)),
        ('cat_preproc', encoder, [0]),
    ]
)

In [31]:
# Fit the preprocessing on the training rows only, then apply it to the test rows.
#
# The raw rows are re-selected from the untouched feature frame `X` using the
# index of the target splits (train_test_split keeps X_* and y_* aligned), so
# this cell is idempotent. The previous form, `X_train = f(X_train)`, would on a
# re-run refit the transformer on the already-encoded array and silently
# corrupt the features.
X_train = col_transformer.fit_transform(X.loc[y_train.index])
X_test = col_transformer.transform(X.loc[y_test.index])

In [32]:
# Model log(price) instead of the raw price.
#
# The values are taken from the untouched target series `y` (selected by the
# split's index) rather than from y_train / y_test themselves. Re-running this
# cell therefore cannot apply the logarithm twice.
y_train = np.log(y.loc[y_train.index])
y_test = np.log(y.loc[y_test.index])

## Models training

In [33]:
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
# Containers shared by every model section below.
# models      -- best estimator of each grid search, in training order
# val_results -- one row per model. 'r2' is the test-set score; 'train_r2' is
#                filled from GridSearchCV.best_score_, i.e. the mean
#                cross-validated R^2 on the training split.
models = []
val_results = pd.DataFrame(columns=['model', 'params', 'r2', 'train_r2'])

### Lasso regression

In [34]:
from sklearn.linear_model import Lasso
# Lasso regression: tune the regularisation strength with 5-fold CV on R^2.
ls = Lasso(random_state=4)
params = {'alpha': [0.0001, 0.001, 0.01, 0.1]}

grid = GridSearchCV(estimator=ls, param_grid=params, scoring='r2', cv=5)
grid.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out test split.
y_pred = grid.predict(X_test)
r2 = r2_score(y_test, y_pred)
# Mean cross-validated R^2 of the winning parameter setting.
r2_train = grid.best_score_
models.append(grid.best_estimator_)

In [35]:
# Append the Lasso row (name, best params, test R^2, CV R^2) and show the table.
val_results.loc[val_results.shape[0]] = [
    'Lasso Regression',
    grid.best_params_,
    r2,
    r2_train,
]
val_results

Unnamed: 0,model,params,r2,train_r2
0,Lasso Regression,{'alpha': 0.0001},0.730682,0.747295


### Ridge regression

In [36]:
from sklearn.linear_model import Ridge
# Ridge regression: same protocol as above, with a slightly wider alpha grid.
rd = Ridge(random_state=4)
params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 1.5]}

grid = GridSearchCV(estimator=rd, param_grid=params, scoring='r2', cv=5)
grid.fit(X_train, y_train)

# Test-set R^2 of the refitted best model.
y_pred = grid.predict(X_test)
r2 = r2_score(y_test, y_pred)
# Mean cross-validated R^2 of the winning parameter setting.
r2_train = grid.best_score_
models.append(grid.best_estimator_)

In [37]:
# Append the Ridge row (name, best params, test R^2, CV R^2) and show the table.
val_results.loc[val_results.shape[0]] = [
    'Ridge Regression',
    grid.best_params_,
    r2,
    r2_train,
]
val_results

Unnamed: 0,model,params,r2,train_r2
0,Lasso Regression,{'alpha': 0.0001},0.730682,0.747295
1,Ridge Regression,{'alpha': 0.1},0.730678,0.747298


### Decision Tree Regression

In [38]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree: search depth and leaf/split size constraints with 5-fold CV.
params = {
    'max_depth': range(6, 21, 2),
    'min_samples_leaf': range(2, 8),
    'min_samples_split': range(2, 10, 2),
}
# Seeded like every other model in this notebook (random_state=4). Without a
# seed the chosen parameters and scores could differ between runs.
dt = DecisionTreeRegressor(random_state=4)

grid = GridSearchCV(dt, params, cv=5, scoring='r2')
grid.fit(X_train, y_train)

# Test-set R^2 of the refitted best tree.
y_pred = grid.best_estimator_.predict(X_test)
r2 = r2_score(y_test, y_pred)
# Mean cross-validated R^2 of the winning parameter setting.
r2_train = grid.best_score_
models.append(grid.best_estimator_)

In [39]:
# Append the decision-tree row (name, best params, test R^2, CV R^2) and show the table.
val_results.loc[val_results.shape[0]] = [
    'DecisionTree Regression',
    grid.best_params_,
    r2,
    r2_train,
]
val_results

Unnamed: 0,model,params,r2,train_r2
0,Lasso Regression,{'alpha': 0.0001},0.730682,0.747295
1,Ridge Regression,{'alpha': 0.1},0.730678,0.747298
2,DecisionTree Regression,"{'max_depth': 20, 'min_samples_leaf': 2, 'min_...",0.846614,0.83074


### Random forest

In [40]:
from sklearn.ensemble import RandomForestRegressor
# Random forest: search the number of trees and their maximum depth.
rf = RandomForestRegressor(random_state=4)
params = {
    'n_estimators': range(60, 200, 10),
    'max_depth': range(6, 13, 1),
}

grid = GridSearchCV(estimator=rf, param_grid=params, scoring='r2', cv=5)
grid.fit(X_train, y_train)

# Test-set R^2 of the refitted best forest.
y_pred = grid.predict(X_test)
r2 = r2_score(y_test, y_pred)
# Mean cross-validated R^2 of the winning parameter setting.
r2_train = grid.best_score_
models.append(grid.best_estimator_)

In [41]:
# Append the random-forest row (name, best params, test R^2, CV R^2) and show the table.
val_results.loc[val_results.shape[0]] = [
    'RandomForestRegressor',
    grid.best_params_,
    r2,
    r2_train,
]
val_results

Unnamed: 0,model,params,r2,train_r2
0,Lasso Regression,{'alpha': 0.0001},0.730682,0.747295
1,Ridge Regression,{'alpha': 0.1},0.730678,0.747298
2,DecisionTree Regression,"{'max_depth': 20, 'min_samples_leaf': 2, 'min_...",0.846614,0.83074
3,RandomForestRegressor,"{'max_depth': 12, 'n_estimators': 180}",0.867252,0.866094


### Gradient Boosting

In [42]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting: search learning rate, number of stages and tree depth.
gb = GradientBoostingRegressor(random_state=4)
params = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': range(250, 351, 50),
    'max_depth': range(9, 13),
}

grid = GridSearchCV(estimator=gb, param_grid=params, scoring='r2', cv=5)
grid.fit(X_train, y_train)

# Test-set R^2 of the refitted best booster.
y_pred = grid.predict(X_test)
r2 = r2_score(y_test, y_pred)
# Mean cross-validated R^2 of the winning parameter setting.
r2_train = grid.best_score_
models.append(grid.best_estimator_)

In [43]:
# Append the gradient-boosting row (name, best params, test R^2, CV R^2) and show the table.
val_results.loc[val_results.shape[0]] = [
    'GradientBoosting Regression',
    grid.best_params_,
    r2,
    r2_train,
]
val_results

Unnamed: 0,model,params,r2,train_r2
0,Lasso Regression,{'alpha': 0.0001},0.730682,0.747295
1,Ridge Regression,{'alpha': 0.1},0.730678,0.747298
2,DecisionTree Regression,"{'max_depth': 20, 'min_samples_leaf': 2, 'min_...",0.846614,0.83074
3,RandomForestRegressor,"{'max_depth': 12, 'n_estimators': 180}",0.867252,0.866094
4,GradientBoosting Regression,"{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",0.919743,0.900379


In [44]:
# `grid` still refers to the gradient-boosting search fitted above; display its
# refitted best estimator.
grid.best_estimator_

### Best model params

In [45]:
# Row 4 of the results table is the gradient-boosting entry; show its tuned parameters.
val_results.loc[4, 'params']

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300}

### Model and pipeline saving

In [46]:
import copy
import pickle
# Build the preprocessor to ship with the model: a copy of the fitted
# ColumnTransformer whose numeric branch has the KNN imputer removed, so only
# the scaler remains. The original `col_transformer` is left untouched.
new_transformer = copy.deepcopy(col_transformer)

# `transformers` holds only the unfitted templates passed to the constructor.
# transform() runs the fitted clones stored in `transformers_` (reachable via
# `named_transformers_`). Editing `transformers` alone leaves the fitted
# imputer in place, so strip it from both to keep the object consistent.
spec_pipeline = new_transformer.transformers[0][1]
fitted_pipeline = new_transformer.named_transformers_['num_preproc']
for num_pipeline in (spec_pipeline, fitted_pipeline):
    # Guarded by step name so a pipeline that was already stripped is not
    # stripped again.
    if num_pipeline.steps and num_pipeline.steps[0][0] == 'imputer':
        num_pipeline.steps.pop(0)

output_file = "model&pipeline.bin"

# `grid` is the most recently fitted search (the gradient-boosting cell), so
# its best estimator is the model that gets persisted with the preprocessor.
with open(output_file, 'wb') as f_out:
    pickle.dump((new_transformer, grid.best_estimator_), f_out)