# IMPORT

In [41]:
import os
import warnings
import sys

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import logging

In [42]:
# Training set with the engineered features; the floor columns are read as
# strings instead of letting pandas infer a dtype.
train = pd.read_csv(
    '../data/train_new_features.csv',
    dtype=dict(floors=str, total_floors=str),
)

# Test set with the engineered features (same dtype overrides).
test = pd.read_csv(
    '../data/test_new_features.csv',
    dtype=dict(floors=str, total_floors=str),
)

# FILL NaN

In [43]:
def fill_missing_values(data, fill_method='mean'):
    """Fill missing values in the ``kitchen_square`` and ``live_square`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to process. It is not modified; a filled copy is returned.
        Must contain ``kitchen_square`` and ``live_square``; the ``knn`` and
        ``mice`` methods additionally read ``square`` as a predictor.
    fill_method : str, default 'mean'
        One of ``'mean'``, ``'median'``, ``'knn'`` or ``'mice'``. The value is
        matched case-insensitively and surrounding whitespace is ignored.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose ``kitchen_square`` and ``live_square`` columns
        hold the imputed values rounded to one decimal place.

    Raises
    ------
    ValueError
        If ``fill_method`` is not one of the supported methods.
    """
    target_cols = ['kitchen_square', 'live_square']

    # Normalise the selector so both 'mean' and a padded '  mean' are accepted;
    # fail loudly on anything else instead of silently returning None.
    method = str(fill_method).strip().lower()
    if method not in ('mean', 'median', 'knn', 'mice'):
        raise ValueError(
            "Unknown fill_method %r; expected one of 'mean', 'median', "
            "'knn', 'mice'." % (fill_method,)
        )

    print('--------------- Fill NaN values ---------------')
    print('\n')

    if method == 'mean':
        print('  mean processing...')
        imputer = SimpleImputer(strategy='mean')
        input_cols = target_cols
    elif method == 'median':
        print('median processing...')
        imputer = SimpleImputer(strategy='median')
        input_cols = target_cols
    elif method == 'knn':
        print('  KNN processing...')
        imputer = KNNImputer(n_neighbors=5)
        # Total area is used as an extra predictor for the two target columns.
        input_cols = ['square'] + target_cols
    else:
        print('  MICE processing...')
        imputer = IterativeImputer(max_iter=10, random_state=42)
        input_cols = ['square'] + target_cols

    filled = data.copy()
    imputer.fit(filled[input_cols])
    imputed = np.round(imputer.transform(filled[input_cols]), 1)

    # Assign the raw array positionally. Wrapping it in a new DataFrame would
    # align on a fresh RangeIndex and scramble rows whenever ``data`` has a
    # non-default index. The target columns are always the last two inputs.
    filled[target_cols] = imputed[:, -len(target_cols):]
    print('  Done.')

    return filled

In [47]:
def eval_metrics(actual, pred):
    """Score predictions against the ground truth.

    Returns a 4-tuple ``(rmse, mae, r2, mape)`` computed with the
    scikit-learn metric functions on ``actual`` and ``pred``.
    """
    scores = (
        np.sqrt(mean_squared_error(actual, pred)),     # RMSE
        mean_absolute_error(actual, pred),             # MAE
        r2_score(actual, pred),                        # R2
        mean_absolute_percentage_error(actual, pred),  # MAPE
    )
    return scores

In [55]:
def mlflow_sklearn_reg(data, fill_method = 'mean', normalization = False):
    """Train a linear regression on the price data and log the run to MLflow.

    Steps: impute missing areas with ``fill_missing_values``, one-hot encode
    the categorical columns, optionally min-max scale the numeric columns,
    split into train/test, fit ``LinearRegression`` and log the parameters,
    metrics and fitted model to MLflow.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is copied and not modified. Must contain ``price``,
        ``page``, ``description``, the three area columns and the categorical
        columns listed in the body.
    fill_method : str, default 'mean'
        Imputation method, passed through to ``fill_missing_values``.
    normalization : bool, default False
        When truthy, ``square``, ``kitchen_square``, ``live_square`` and
        ``price`` are min-max scaled before training.

    Returns
    -------
    None
        RMSE, MAE, R2 and MAPE are printed and logged to MLflow. They are
        computed on the original price scale in both modes, so runs with and
        without normalization can be compared directly.
    """
    logging.basicConfig(level=logging.WARN)

    # Fill NaN
    df_filled = fill_missing_values(data.copy(), fill_method = fill_method)

    df_ohe = pd.get_dummies(df_filled,
                            columns=['flat_type', 'object_type', 'rooms', 'build_matireal',
                                    'district_rating','district','underground','eco_rating','clear_rating','gkh_rating',
                                    'neighbor_rating','kids_rating','sport_rest_rating','shop_rating',
                                    'traffic_rating','secure_rating','life_price_rating','metro_station',
                                    'num_of_metro_stations','num_of_kindg','num_of_schools','num_of_poly',
                                    'num_of_hospitals','num_of_dentists','num_of_women_cons'])

    # Normalization. MinMaxScaler treats every column independently, so using
    # a dedicated scaler for the target gives the same scaled values as one
    # joint scaler while letting us invert the target on its own later.
    price_scaler = None
    if normalization:
        area_cols = ['square', 'kitchen_square', 'live_square']
        df_ohe[area_cols] = MinMaxScaler().fit_transform(df_ohe[area_cols])
        price_scaler = MinMaxScaler()
        df_ohe[['price']] = price_scaler.fit_transform(df_ohe[['price']])

    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train_df, test_df = train_test_split(df_ohe)

    # The predicted column is "price"; "page" and "description" are dropped
    # from the feature matrix together with the target.
    train_x = train_df.drop(["price", 'page', 'description'], axis=1)
    test_x = test_df.drop(["price", 'page', 'description'], axis=1)
    train_y = train_df[["price"]]
    test_y = test_df[["price"]]

    with mlflow.start_run():
        lr = LinearRegression()
        lr.fit(train_x, train_y)

        predicted_prices = lr.predict(test_x)

        if price_scaler is not None:
            # Undo the min-max scaling of the target before scoring.
            actual_prices = price_scaler.inverse_transform(test_y)
            predicted_prices = price_scaler.inverse_transform(predicted_prices)
        else:
            actual_prices = test_y

        (rmse, mae, r2, mape) = eval_metrics(actual_prices, predicted_prices)

        print('\n')
        print('--------------- Training ---------------')
        print('\n')
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
        print("  MAPE: %s" % mape)

        metrics = {'RMSE': rmse, 'MAE': mae, 'R2': r2, 'MAPE': mape}
        params = {"Fill NaN method": fill_method, 'Normalization': normalization}

        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

        mlflow.sklearn.log_model(lr, "model")

# TESTS

In [56]:
# Mean imputation, no scaling.
mlflow_sklearn_reg(train, fill_method='mean')

--------------- Fill NaN values ---------------
mean processing...
Done.


--------------- Training ---------------


  RMSE: 39961812.32373111
  MAE: 2422797.1729404386
  R2: 0.05190250340265001
  MAPE: 0.2226219716846474


In [57]:
# Mean imputation with min-max scaling.
mlflow_sklearn_reg(train, fill_method='mean', normalization=True)

--------------- Fill NaN values ---------------
mean processing...
Done.


--------------- Training ---------------


  RMSE: 39961812.32373056
  MAE: 2422797.1729396917
  R2: 0.05190250340267599
  MAPE: 0.2226219716845226


In [58]:
# Median imputation, no scaling.
mlflow_sklearn_reg(train, fill_method='median')

--------------- Fill NaN values ---------------
median processing...
Done.


--------------- Training ---------------


  RMSE: 39964455.422399044
  MAE: 2422077.312322646
  R2: 0.051777083760446985
  MAPE: 0.22224425116766455


In [59]:
# Median imputation with min-max scaling.
mlflow_sklearn_reg(train, fill_method='median', normalization=True)

--------------- Fill NaN values ---------------
median processing...
Done.


--------------- Training ---------------


  RMSE: 39964455.422399014
  MAE: 2422077.3123226482
  R2: 0.05177708376044832
  MAPE: 0.22224425116767674


In [60]:
# KNN imputation, no scaling.
mlflow_sklearn_reg(train, fill_method='knn')

--------------- Fill NaN values ---------------
KNN processing...
Done.


--------------- Training ---------------


  RMSE: 40012138.60288686
  MAE: 2473446.8296664166
  R2: 0.049513008974006456
  MAPE: 0.22980487769683852


In [61]:
# KNN imputation with min-max scaling.
mlflow_sklearn_reg(train, fill_method='knn', normalization=True)

--------------- Fill NaN values ---------------
KNN processing...
Done.


--------------- Training ---------------


  RMSE: 40012138.60288638
  MAE: 2473446.829665861
  R2: 0.04951300897402944
  MAPE: 0.22980487769675095


In [62]:
# MICE imputation, no scaling.
mlflow_sklearn_reg(train, fill_method='mice')

--------------- Fill NaN values ---------------
MICE processing...
Done.


--------------- Training ---------------


  RMSE: 39930781.28446545
  MAE: 2437017.324384338
  R2: 0.05337435996719486
  MAPE: 0.22522010786948213


In [63]:
# MICE imputation with min-max scaling.
mlflow_sklearn_reg(train, fill_method='mice', normalization=True)

--------------- Fill NaN values ---------------
MICE processing...
Done.


--------------- Training ---------------


  RMSE: 39930781.28446485
  MAE: 2437017.3243835866
  R2: 0.053374359967223284
  MAPE: 0.22522010786935204
