In [1]:
import pickle

In [2]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None
# Show up to 60 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 60)

# Imputing missing values
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn import tree


Загружаем финальную модель, объект SimpleImputer и данные признаков.

In [3]:
# Locations of the artefacts loaded below: the model, the imputer and
# the feature description.
# NOTE: pickle.load executes arbitrary code embedded in the file, so these
# files must come from a trusted source.
model_pkl_file = "pkl/model.pkl"
imputer_pkl_file = "pkl/imputer.pkl"
features_pkl_file = "pkl/features.pkl"

# Model used for prediction
with open(model_pkl_file, 'rb') as model_handle:
    model = pickle.load(model_handle)

# Imputer applied to the feature matrix before prediction
with open(imputer_pkl_file, 'rb') as imputer_handle:
    imputer = pickle.load(imputer_handle)

# Feature description; its `feature` and `dtype` fields are read further down
with open(features_pkl_file, 'rb') as features_handle:
    features_names = pickle.load(features_handle)

Загружаем данные квартир без евроремонта, чтобы рассчитать, сколько они стоили бы с евроремонтом.

In [4]:
# Listings of apartments without a euro-renovation (raw sheet, kept untouched
# in `data` so the predictions can be attached to it at the end)
data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/data/euro_renovation.xlsx')

# Treat empty strings and the literal text 'None' as missing values (NaN)
data_flat = data.replace(['', 'None'], np.nan)

In [5]:
# Name fragments that mark a column as numeric (float).
# NOTE(review): matching is by substring, so e.g. 'lat' or 'lon' would also
# match unrelated column names containing those letters - confirm against the
# actual column list.
float_markers = ('price', 'area', 'height', 'owner_count', 'lat', 'lon')

# Columns holding yes/no flags; a missing value is treated as False.
flag_columns = ('all_data.house.has_garbage_disposer', 'has_gas')

for col in list(data_flat.columns):
    if any(marker in col for marker in float_markers):
        # Values may be strings with a decimal comma: turn ',' into '.'
        # and convert the data type to float
        data_flat[col] = data_flat[col].replace(regex={',': '.'}).astype(float)
    elif col == 'rooms':
        # If no data about amount of rooms set to 1.
        data_flat[col] = data_flat[col].fillna(1).astype(int)
    elif col == 'build_year':
        # If no data about year of building set to the (rounded) mean year.
        # NOTE(review): a later cell rewrites build_year 23 to 2023; the mean
        # here is computed before that correction.
        mean_year = np.round(data_flat[col].mean())
        data_flat[col] = data_flat[col].fillna(mean_year).astype(int)
    elif col in flag_columns:
        # If no data about the flag set to False. infer_objects() turns an
        # all-boolean object column into a real bool column so that it is not
        # one-hot encoded as a categorical later on.
        data_flat[col] = data_flat[col].fillna(False).infer_objects()

Заменяем значение признака «тип ремонта» на «Евроремонт» у всех квартир и рассчитываем цену каждой квартиры как произведение цены квадратного метра на площадь.

In [6]:
# Mark every apartment as having a euro-renovation
data_flat = data_flat.assign(renovation='Евроремонт')

# Listed price of each apartment: price per square metre times area.
# Plain column arithmetic gives the same Series as a row-wise apply,
# without calling a Python function once per row.
price = data_flat['price_sq'] * data_flat['area']

In [7]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns a DataFrame indexed by column name with
    two columns: 'Missing Values' (number of nulls) and '% of Total Values'
    (nulls as a percentage of the row count, rounded to one decimal).
    Only columns with at least one missing value are listed, sorted by
    percentage in descending order.
    """
    # Null count and null percentage for every column
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # Side-by-side table with readable column names
    summary = pd.concat([null_counts, null_percent], axis=1)
    summary = summary.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns that actually have gaps, worst first
    with_gaps = summary[summary.iloc[:, 1] != 0]
    ordered = with_gaps.sort_values('% of Total Values', ascending=False)
    result = ordered.round(1)

    # Short textual summary
    total_columns = str(df.shape[1])
    affected_columns = str(result.shape[0])
    print("Your selected dataframe has " + total_columns + " columns.\n"
          "There are " + affected_columns +
          " columns that have missing values.")

    return result

# Find the columns where more than half of the values are missing
missing_df = missing_values_table(data_flat)
over_half_missing = missing_df['% of Total Values'] > 50
missing_columns = missing_df.loc[over_half_missing].index.tolist()
print('We will remove %d columns.' % len(missing_columns))

# Remove those columns from the working frame
data_flat = data_flat.drop(columns=missing_columns)

Your selected dataframe has 61 columns.
There are 38 columns that have missing values.
We will remove 30 columns.


In [8]:
# Change build_year to 2023 of apartment with build_year=23.
# This is an exact value replacement on an integer column, so the plain
# (non-regex) form of replace is used; regex patterns are for strings.
data_flat['build_year'] = data_flat['build_year'].replace(23, 2023)

In [20]:
# Select the numeric columns
numeric_subset = data_flat.select_dtypes('number')

# Add a 'log_<name>' column for every numeric column except 'price'
# (the drop is a no-op when the frame has no 'price' column).
log_source = numeric_subset.drop(columns=['price'], errors='ignore')

# log(0) yields -inf and log of a negative number yields NaN. Both are
# expected here: infinities are replaced with NaN and imputed in a later
# cell, so the corresponding numpy warnings are silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    log_columns = np.log(log_source).add_prefix('log_')

numeric_subset = pd.concat([numeric_subset, log_columns], axis=1)

# Select the categorical columns
categorical_subset = data_flat[['city', 'house_wall_type', 'renovation',
                                'is_apartment', 'has_gas', 'sale_type']]

# One hot encode
categorical_subset = pd.get_dummies(categorical_subset)

# Join the two dataframes using concat
# Make sure to use axis = 1 to perform a column bind
features = pd.concat([numeric_subset, categorical_subset], axis=1)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [11]:
# Print the column names, non-null counts and dtypes of the assembled features
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1320 entries, 0 to 1319
Data columns (total 60 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    1320 non-null   int64  
 1   offer_views                           1320 non-null   int64  
 2   lat                                   1320 non-null   float64
 3   lon                                   1320 non-null   float64
 4   price_sq                              1320 non-null   float64
 5   area                                  1320 non-null   float64
 6   floor                                 1320 non-null   int64  
 7   kitchen_area                          1320 non-null   float64
 8   balconies                             1320 non-null   int64  
 9   rooms                                 1320 non-null   int64  
 10  house_floors                          1320 non-null   int64  
 11  lifts            

Для того чтобы модель работала корректно, количество и порядок признаков должны соответствовать признакам тренировочных данных.

In [21]:
# Keep the first occurrence of every column label and discard later repeats.
# Selecting with a boolean mask is required here: dropping by label would
# remove every column carrying a duplicated name, including the first one.
features = features.loc[:, ~features.columns.duplicated()]

# Bring the frame to the training layout in one step:
#   * columns unknown to the training data are dropped,
#   * training columns absent here are added and filled with 0,
#   * the column order follows the training feature list.
features = features.reindex(columns=features_names.feature.values, fill_value=0)

# Cast every column to the dtype recorded for it in the feature description
training_dtypes = dict(zip(features_names.feature.values,
                           features_names.dtype.values))
features = features.astype(training_dtypes)

In [22]:
# Infinite values (e.g. -inf from the log of zero) are marked as missing
# so that the imputer can fill them
features = features.replace([np.inf, -np.inf], np.nan)

# Fill the missing values with the loaded imputer; the result is indexed
# positionally in the next step, i.e. it is used as a plain 2-D array
features = imputer.transform(features)

Оставляем наиболее важные признаки.

In [24]:
# Column positions of the features passed to the model
# NOTE(review): presumably the most important features chosen during
# training, in the order the model expects - confirm against the training notebook
indices = [5, 4, 3, 2, 11, 6, 0, 13, 10, 1]
features_reduced = np.take(features, indices, axis=1)

Создаем данные прогнозирования.

In [25]:
# Run the loaded model on the reduced feature matrix
data_flat_pred = model.predict(features_reduced)

Добавляем в датафрейм два столбца: цены, рассчитанные по цене квадратного метра и площади квартиры, и цены, спрогнозированные моделью.

In [27]:
# Attach both price columns to the originally loaded sheet:
#   price      - price per square metre times area
#   price_pred - model output for the same rows
data = data.assign(price=price, price_pred=data_flat_pred)

Сохраняем датафрейм.

In [28]:
# Write the sheet with both price columns to an Excel file
# NOTE(review): the path is relative to the working directory, while the input
# was read from an absolute Drive path - confirm the 'data' folder exists there
data.to_excel('data/flat_pred.xlsx')