# IMPORT

In [272]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from joblib import dump

import neptune.new as neptune
import optuna
import neptune.new.integrations.optuna as optuna_utils

In [263]:
# train с новыми признаками
train = pd.read_csv('../data/train_new_features.csv',
                    dtype={'floors': str,
                           'total_floors': str})

# test с новыми признаками
test = pd.read_csv('../data/test_new_features.csv',
                    dtype={'floors': str,
                           'total_floors': str})

# Sklearn Linear Regression

In [221]:
def fill_missing_values(data, fill_method='mean'):
    
    """ 
        Заполняет пропуски четырмя способами: mean, median, KNN, MICE. 
    
        data - датафрейм, который нужно обработать.
        fill_method - метод заполнения пропусков (mean, median, knn, mice)
        
        output: data with fiiled NaN's.
    """
    
    print(f'Fill method: {fill_method}')
    
    if fill_method == 'mean':
    
        # Mean
        df_mean = data.copy()
        imp_mean = SimpleImputer(strategy='mean')
        y_mean = np.round(imp_mean.fit_transform(df_mean[['kitchen_square', 'live_square']]), 1)
        df_mean['kitchen_square'] = pd.DataFrame(y_mean)[0]
        df_mean['live_square'] = pd.DataFrame(y_mean)[1]
        
        return df_mean

    elif fill_method == 'median':
        
        # Median
        df_median = data.copy()
        imp_median = SimpleImputer(strategy='median')
        y_median = np.round(imp_median.fit_transform(df_median[['kitchen_square', 'live_square']]), 1)
        df_median['kitchen_square'] = pd.DataFrame(y_median)[0]
        df_median['live_square'] = pd.DataFrame(y_median)[1]
        
        return df_median
    
    elif fill_method == 'knn':

        # KNN
        df_knn = data.copy()
        x_knn = df_knn[['square', 'kitchen_square', 'live_square']]
        imp_knn = KNNImputer(n_neighbors=5)
        imp_knn.fit(x_knn)
        x_knn = np.round(imp_knn.transform(x_knn), 1)
        y_knn = pd.DataFrame(x_knn)
        df_knn['kitchen_square'] = y_knn[1]
        df_knn['live_square'] = y_knn[2]
        
        return df_knn
    
    elif fill_method == 'mice':

        # MICE
        df_mice = data.copy()
        x_mice = df_mice[['square', 'kitchen_square', 'live_square']]
        mice_imp = IterativeImputer(max_iter=10, random_state=42)
        mice_imp.fit(x_mice)
        x_mice = np.round(mice_imp.transform(x_mice), 1)
        y_mice = pd.DataFrame(x_mice)
        df_mice['kitchen_square'] = y_mice[1]
        df_mice['live_square'] = y_mice[2]
        
        return df_mice

In [115]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    mape = mean_absolute_percentage_error(actual, pred)
    return rmse, mae, r2, mape

In [259]:
def nt_sklearn_lr(data, fill_method, squares, norm=False):
    
    df = data.copy()
    df.reset_index(inplace=True)
    
    run = neptune.init(project='alxkzncoff/flats',
                       api_token='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI5ZmI4NDE1Ny03YzcyLTQwOTQtOTUwMi1kODlkYWMwY2YwOGQifQ==',
                       name = 'sklearn_regression')

    # Fill NaN
    df_filled = fill_missing_values(df, fill_method)

    # Dummies
    df_dummies = pd.get_dummies(df_filled, 
                                columns=['flat_type', 'object_type', 'rooms', 'build_matireal',
                                        'district_rating','district','underground','eco_rating','clear_rating','gkh_rating',
                                        'neighbor_rating','kids_rating','sport_rest_rating','shop_rating',
                                        'traffic_rating','secure_rating','life_price_rating','metro_station',
                                        'num_of_metro_stations','num_of_kindg','num_of_schools','num_of_poly',
                                        'num_of_hospitals','num_of_dentists','num_of_women_cons'])

    X = df_dummies.drop(['page', 'description', 'price'], axis=1)
    y = df_dummies['price']
    
    # Normalization
    print(f'Normalization: {norm}')
    if norm == True:
        MMS = MinMaxScaler()
        X[['square', 'kitchen_square', 'live_square']] = MMS.fit_transform(X[['square', 'kitchen_square', 'live_square']])
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    predict = lr.predict(X_test)
    (rmse, mae, r2, mape) = eval_metrics(y_test, predict)

    run["sys/tags"].add(['sklearn_LinearReg'])
    run['sk_LinearRegression/parameters'] = {'fill method': fill_method, 
                                             'normalization': norm,
                                             'squares': squares}
    run['sk_LinearRegression/rmse'] = np.round(rmse, 2)
    run['sk_LinearRegression/mae'] = np.round(mae, 2)
    run['sk_LinearRegression/r2'] = np.round(r2, 2)
    run['sk_LinearRegression/mape'] = np.round(mape, 2)
    run["sk_LinearRegression/model"].upload('sklearn_linear_regression_model.pkl')

    print(f'RMSE: {np.round(rmse, 2)} | MAE: {np.round(mae, 2)} | R2: {np.round(r2, 2)} | MAPE: {np.round(mape, 2)}')

    dump(lr, 'sklearn_linear_regression_model.pkl')

## All sqaures

In [230]:
for norm in [False, True]:
    for fill_method in ['mean', 'median', 'knn', 'mice']:
        nt_sklearn_lr(train, fill_method, 'all', norm)

https://app.neptune.ai/alxkzncoff/flats/e/FLAT-82
Fill method: mean
Normalization: False
RMSE: 7687833758158.7 | MAE: 55631077471.68 | R2: -35309877014.65 | MAPE: 20992.18
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-83
Fill method: median
Normalization: False
RMSE: 77896723456625.16 | MAE: 563658074162.18 | R2: -3625155320395.92 | MAPE: 212700.48
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-84
Fill method: knn
Normalization: False
RMSE: 7169733012219.74 | MAE: 51882154915.75 | R2: -30711017468.21 | MAPE: 19577.48
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-85
Fill method: mice
Normalization: False
RMSE: 56437329289101.72 | MAE: 408379312044.4 | R2: -1902922695367.01 | MAPE: 154104.71
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-86
Fill method: mean
Normalization: True
RMSE: 40057076.88 | MAE: 2408590.74 | R2: 0.04 | MAPE: 0.23
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-87
Fill method: median
Normalization: True
RMSE: 40059699.21 | MAE: 2409448.87 | R2: 0.04 | MAPE: 0.23
h

## < 400 m2

In [264]:
train_400m = train[(train['square']<400) & (train['price']<100000000)]

In [265]:
for norm in [False, True]:
    for fill_method in ['mean', 'median', 'knn', 'mice']:
        nt_sklearn_lr(train_400m, fill_method, '<400 m2', norm)

https://app.neptune.ai/alxkzncoff/flats/e/FLAT-105
Fill method: mean
Normalization: False
RMSE: 3662937.59 | MAE: 1817266.4 | R2: 0.84 | MAPE: 0.19
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-106
Fill method: median
Normalization: False
RMSE: 3663345.16 | MAE: 1818886.62 | R2: 0.84 | MAPE: 0.19
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-107
Fill method: knn
Normalization: False
RMSE: 3662809.55 | MAE: 1815584.71 | R2: 0.84 | MAPE: 0.19
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-108
Fill method: mice
Normalization: False
RMSE: 3676481.13 | MAE: 1819995.51 | R2: 0.83 | MAPE: 0.19
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-109
Fill method: mean
Normalization: True
RMSE: 3662937.59 | MAE: 1817266.4 | R2: 0.84 | MAPE: 0.19
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-110
Fill method: median
Normalization: True
RMSE: 3663345.16 | MAE: 1818886.62 | R2: 0.84 | MAPE: 0.19
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-111
Fill method: knn
Normalization: True
RMSE: 3662809.55 | 

## <130 m2

In [266]:
train_130m = train[(train['square']<131) & (train['price']<100000000)]

In [267]:
for norm in [False, True]:
    for fill_method in ['mean', 'median', 'knn', 'mice']:
        nt_sklearn_lr(train_130m, fill_method, '<130 m2', norm)

https://app.neptune.ai/alxkzncoff/flats/e/FLAT-113
Fill method: mean
Normalization: False
RMSE: 2496131.99 | MAE: 1392750.16 | R2: 0.83 | MAPE: 0.17
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-114
Fill method: median
Normalization: False
RMSE: 2496811.17 | MAE: 1393428.18 | R2: 0.83 | MAPE: 0.17
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-115
Fill method: knn
Normalization: False
RMSE: 2490684.24 | MAE: 1389889.09 | R2: 0.83 | MAPE: 0.17
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-116
Fill method: mice




Normalization: False
RMSE: 2503514.23 | MAE: 1394653.41 | R2: 0.83 | MAPE: 0.17
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-117
Fill method: mean
Normalization: True
RMSE: 2496131.99 | MAE: 1392750.16 | R2: 0.83 | MAPE: 0.17
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-118
Fill method: median
Normalization: True
RMSE: 2496811.17 | MAE: 1393428.18 | R2: 0.83 | MAPE: 0.17
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-119
Fill method: knn
Normalization: True
RMSE: 2490684.24 | MAE: 1389889.09 | R2: 0.83 | MAPE: 0.17
https://app.neptune.ai/alxkzncoff/flats/e/FLAT-120
Fill method: mice




Normalization: True
RMSE: 2503514.23 | MAE: 1394653.41 | R2: 0.83 | MAPE: 0.17


# Sklearn Ridge

In [282]:
train_knn = fill_missing_values(train, 'knn')

Fill method: knn


In [283]:
train_knn.to_csv('../data/train_knn', index=False)

In [None]:
def nt_sklearn_ridge(data, fill_method, squares, norm=False):
    
    df = data.copy()
    df.reset_index(inplace=True)
    
    run = neptune.init(project='alxkzncoff/flats',
                       api_token='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI5ZmI4NDE1Ny03YzcyLTQwOTQtOTUwMi1kODlkYWMwY2YwOGQifQ==',
                       name = 'sklearn_regression')

    # Fill NaN
    df_filled = fill_missing_values(df, fill_method)

    # Dummies
    df_dummies = pd.get_dummies(df_filled, 
                                columns=['flat_type', 'object_type', 'rooms', 'build_matireal',
                                        'district_rating','district','underground','eco_rating','clear_rating','gkh_rating',
                                        'neighbor_rating','kids_rating','sport_rest_rating','shop_rating',
                                        'traffic_rating','secure_rating','life_price_rating','metro_station',
                                        'num_of_metro_stations','num_of_kindg','num_of_schools','num_of_poly',
                                        'num_of_hospitals','num_of_dentists','num_of_women_cons'])

    X = df_dummies.drop(['page', 'description', 'price'], axis=1)
    y = df_dummies['price']
    
    # Normalization
    print(f'Normalization: {norm}')
    if norm == True:
        MMS = MinMaxScaler()
        X[['square', 'kitchen_square', 'live_square']] = MMS.fit_transform(X[['square', 'kitchen_square', 'live_square']])
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)
    
    rd = Ridge(alpha=1.0)
    rd.fit(X_train, y_train)

    predict = rd.predict(X_test)
    (rmse, mae, r2, mape) = eval_metrics(y_test, predict)

    run["sys/tags"].add(['sklearn_LinearReg'])
    run['sk_LinearRegression/parameters'] = {'fill method': fill_method, 
                                             'normalization': norm,
                                             'squares': squares}
    run['sk_LinearRegression/rmse'] = np.round(rmse, 2)
    run['sk_LinearRegression/mae'] = np.round(mae, 2)
    run['sk_LinearRegression/r2'] = np.round(r2, 2)
    run['sk_LinearRegression/mape'] = np.round(mape, 2)
    run["sk_LinearRegression/model"].upload('sklearn_linear_regression_model.pkl')

    print(f'RMSE: {np.round(rmse, 2)} | MAE: {np.round(mae, 2)} | R2: {np.round(r2, 2)} | MAPE: {np.round(mape, 2)}')

    dump(lr, 'sklearn_linear_regression_model.pkl')

In [284]:
def objective(trial):
    # hyperparameter setting
    alpha = trial.suggest_uniform('alpha', 0.0, 2.0)
    
    # data loading and train-test split
    
    train = pd.read_csv('../data/train_knn.csv',
                    dtype={'floors': str,
                           'total_floors': str})
    
    # Dummies
    df_dummies = pd.get_dummies(train, 
                                columns=['flat_type', 'object_type', 'rooms', 'build_matireal',
                                        'district_rating','district','underground','eco_rating','clear_rating','gkh_rating',
                                        'neighbor_rating','kids_rating','sport_rest_rating','shop_rating',
                                        'traffic_rating','secure_rating','life_price_rating','metro_station',
                                        'num_of_metro_stations','num_of_kindg','num_of_schools','num_of_poly',
                                        'num_of_hospitals','num_of_dentists','num_of_women_cons'])
    
    X = df_dummies.drop(['page', 'description', 'price'], axis=1)
    y = df_dummies['price']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)
    
    # model training and evaluation
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    error = mean_absolute_percentage_error(y_test, y_pred)

    # output: evaluation score
    return error

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[32m[I 2021-06-20 19:38:18,105][0m A new study created in memory with name: no-name-56d2915f-f256-4c76-9065-dcae28ca38c8[0m
[32m[I 2021-06-20 19:38:24,030][0m Trial 0 finished with value: 0.2284226887126013 and parameters: {'alpha': 0.42710692713631815}. Best is trial 0 with value: 0.2284226887126013.[0m
[32m[I 2021-06-20 19:38:28,854][0m Trial 1 finished with value: 0.2284653956649313 and parameters: {'alpha': 0.21158442893373963}. Best is trial 0 with value: 0.2284226887126013.[0m
[32m[I 2021-06-20 19:38:33,880][0m Trial 2 finished with value: 0.2284261754037365 and parameters: {'alpha': 0.4093918790298692}. Best is trial 0 with value: 0.2284226887126013.[0m
[32m[I 2021-06-20 19:38:39,021][0m Trial 3 finished with value: 0.22813863595918915 and parameters: {'alpha': 1.9889319182804694}. Best is trial 3 with value: 0.22813863595918915.[0m
[32m[I 2021-06-20 19:38:44,150][0m Trial 4 finished with value: 0.2281524641604035 and parameters: {'alpha': 1.908496675696861}. Be