# IMPORT

In [125]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# PREPARATION

In [126]:
# Parse 'floors' and 'total_floors' as strings instead of letting pandas infer a dtype.
floor_dtypes = {'floors': str, 'total_floors': str}

# train set with the new features
train = pd.read_csv('../data/train_new_features.csv', dtype=floor_dtypes)

# test set with the new features
test = pd.read_csv('../data/test_new_features.csv', dtype=floor_dtypes)

In [127]:
# Print the column dtypes and non-null counts of the train frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76393 entries, 0 to 76392
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   page                   76393 non-null  object 
 1   description            76393 non-null  object 
 2   flat_type              76393 non-null  object 
 3   object_type            76393 non-null  object 
 4   rooms                  76393 non-null  object 
 5   floors                 76393 non-null  object 
 6   square                 76393 non-null  float64
 7   kitchen_square         40537 non-null  float64
 8   live_square            44509 non-null  float64
 9   price                  76393 non-null  float64
 10  build_matireal         76393 non-null  object 
 11  district_rating        76393 non-null  object 
 12  district               76393 non-null  object 
 13  underground            76393 non-null  object 
 14  eco_rating             76393 non-null  object 
 15  cl

In [128]:
# Print the column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32738 entries, 0 to 32737
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   page                   32738 non-null  object 
 1   description            32738 non-null  object 
 2   flat_type              32738 non-null  object 
 3   object_type            32738 non-null  object 
 4   rooms                  32738 non-null  object 
 5   floors                 32738 non-null  object 
 6   square                 32738 non-null  float64
 7   kitchen_square         17338 non-null  float64
 8   live_square            18980 non-null  float64
 9   price                  32738 non-null  float64
 10  build_matireal         32738 non-null  object 
 11  district_rating        32738 non-null  object 
 12  district               32738 non-null  object 
 13  underground            32738 non-null  object 
 14  eco_rating             32738 non-null  object 
 15  cl

In [129]:
# Tag every row with its origin; split_df() relies on this marker to separate the frames again.
train['sample'] = 1  # train rows
test['sample'] = 0  # test rows

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat(..., ignore_index=True) is the supported equivalent of
# append(...).reset_index(drop=True).
df = pd.concat([train, test], sort=False, ignore_index=True)

# Fill NaN

**Методы заполнения пропусков:**

1. Mean
2. Median
3. KNN
4. MICE

In [130]:
def split_df(df):
    """Split a combined frame back into its train and test parts.

    Rows are routed by the ``sample`` marker column: ``1`` selects train rows,
    ``0`` selects test rows. The marker column is removed from both results
    and the input frame itself is not modified.

    Returns:
        tuple: ``(train, test)`` DataFrames that keep the row index of ``df``.
    """
    is_train = df['sample'] == 1
    is_test = df['sample'] == 0

    # Boolean selection followed by a non-inplace drop yields independent frames.
    train_part = df.loc[is_train].drop(columns=['sample'])
    test_part = df.loc[is_test].drop(columns=['sample'])

    return train_part, test_part

In [131]:
def _impute_columns(df, imputer, feature_cols, target_cols):
    """Return a copy of ``df`` with ``target_cols`` replaced by imputed values rounded to 1 decimal."""
    features = df[feature_cols]
    # Fit and transform are kept as two separate calls on the same data.
    imputer.fit(features)
    imputed = np.round(imputer.transform(features), 1)

    # Guard against the imputer returning fewer columns than it was given,
    # which would otherwise surface as an obscure indexing error below.
    if imputed.shape[1] != len(feature_cols):
        raise ValueError(
            'Imputer returned %d columns for %d input features: %s'
            % (imputed.shape[1], len(feature_cols), feature_cols)
        )

    result = df.copy()
    for col in target_cols:
        # Positional assignment: no reliance on index alignment with a temporary frame.
        result[col] = imputed[:, feature_cols.index(col)]
    return result


def fill_missing_values(train_data, test_data):
    """Fill gaps in ``kitchen_square`` / ``live_square`` with four strategies.

    The train and test frames are stacked, the two columns are imputed on the
    combined data, and every imputed variant is split back with ``split_df``.

    Strategies:
        * mean   - ``SimpleImputer(strategy='mean')`` on the two target columns.
        * median - ``SimpleImputer(strategy='median')`` on the two target columns.
        * KNN    - ``KNNImputer(n_neighbors=5)`` on ``square`` + the two targets.
        * MICE   - ``IterativeImputer(max_iter=10, random_state=42)`` on
                   ``square`` + the two targets.

    All imputed values are rounded to one decimal place. The input frames are
    not modified (the ``sample`` marker is added to copies).

    NOTE(review): every imputer is fitted on train and test rows together, so
    test-set statistics influence the values filled into the train set. Confirm
    this is intended before using the outputs for model validation.

    Args:
        train_data: train DataFrame containing ``square``, ``kitchen_square``
            and ``live_square``.
        test_data: test DataFrame with the same columns.

    Returns:
        tuple: ``(train_mean, train_median, train_knn, train_mice,
        test_mean, test_median, test_knn, test_mice)``.
    """
    target_cols = ['kitchen_square', 'live_square']
    feature_cols = ['square'] + target_cols

    # Stack train and test; .assign() works on copies so the caller's frames stay untouched.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(
        [train_data.assign(sample=1), test_data.assign(sample=0)],
        sort=False,
        ignore_index=True,
    )

    # Mean
    print('mean processing...')
    df_mean = _impute_columns(df, SimpleImputer(strategy='mean'), target_cols, target_cols)

    # Median
    print('median processing...')
    df_median = _impute_columns(df, SimpleImputer(strategy='median'), target_cols, target_cols)

    # KNN: 'square' is used as an extra feature but is not overwritten.
    print('KNN processing...')
    df_knn = _impute_columns(df, KNNImputer(n_neighbors=5), feature_cols, target_cols)

    # MICE: same feature set as KNN.
    print('MICE processing...')
    df_mice = _impute_columns(
        df, IterativeImputer(max_iter=10, random_state=42), feature_cols, target_cols
    )

    # Split every imputed variant back into train / test.
    train_mean, test_mean = split_df(df_mean)
    train_median, test_median = split_df(df_median)
    train_knn, test_knn = split_df(df_knn)
    train_mice, test_mice = split_df(df_mice)
    print('Done.')

    return train_mean, train_median, train_knn, train_mice, test_mean, test_median, test_knn, test_mice

In [132]:
# Build all four imputed variants of the train and test frames in one call.
(
    train_mean, train_median, train_knn, train_mice,
    test_mean, test_median, test_knn, test_mice,
) = fill_missing_values(train_data=train, test_data=test)

mean processing...
median processing...
KNN processing...
MICE processing...
Done.


# EXPORT DATA

In [136]:
# Each imputed frame paired with its destination file; written in the listed order.
exports = [
    (train_mean, '../data/train_mean.csv'),
    (train_median, '../data/train_median.csv'),
    (train_knn, '../data/train_knn.csv'),
    (train_mice, '../data/train_mice.csv'),
    (test_mean, '../data/test_mean.csv'),
    (test_median, '../data/test_median.csv'),
    (test_knn, '../data/test_knn.csv'),
    (test_mice, '../data/test_mice.csv'),
]

for frame, path in exports:
    # index=False: the row index is not written to the CSV.
    frame.to_csv(path, index=False)