**PROJECT REAL ESTATE MOSCOW**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, GridSearchCV

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
matplotlib.rcParams.update({'font.size': 14})

In [5]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    """Report R2 for two samples and plot predicted against true values.

    Prints the R2 score (rounded to 3 decimals) for the train pair and then
    for the test pair, and shows one figure with two side-by-side scatter
    plots: predictions on the x axis, true values on the y axis.

    Args:
        train_true_values: true targets of the train sample.
        train_pred_values: predictions for the train sample.
        test_true_values: true targets of the test sample.
        test_pred_values: predictions for the test sample.
    """
    panels = (
        ("Train R2:\t", 'Train sample prediction', train_true_values, train_pred_values),
        ("Test R2:\t", 'Test sample prediction', test_true_values, test_pred_values),
    )

    # Scores first, so the text output precedes the figure.
    for prefix, _, actual, predicted in panels:
        print(prefix + str(round(r2_score(actual, predicted), 3)))

    _, axes = plt.subplots(1, 2, figsize=(10, 5))
    for ax, (_, title, actual, predicted) in zip(axes, panels):
        sns.scatterplot(x=predicted, y=actual, ax=ax)
        ax.set_xlabel('Predicted values')
        ax.set_ylabel('True values')
        ax.set_title(title)

    plt.show()

**Пути к директориям и файлам**

In [6]:
TRAIN_DATASET_PATH = '../input/real-estate-price-prediction-moscow/train.csv'
TEST_DATASET_PATH = '../input/real-estate-price-prediction-moscow/test.csv'


### Загрузка данных <a class='anchor' id='load'>

**Описание датасета**

* **Id** - идентификационный номер квартиры
* **DistrictId** - идентификационный номер района
* **Rooms** - количество комнат
* **Square** - площадь
* **LifeSquare** - жилая площадь
* **KitchenSquare** - площадь кухни
* **Floor** - этаж
* **HouseFloor** - количество этажей в доме
* **HouseYear** - год постройки дома
* **Ecology_1, Ecology_2, Ecology_3** - экологические показатели местности
* **Social_1, Social_2, Social_3** - социальные показатели местности
* **Healthcare_1, Helthcare_2** - показатели местности, связанные с охраной здоровья
* **Shops_1, Shops_2** - показатели, связанные с наличием магазинов, торговых центров
* **Price** - цена квартиры

In [7]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)
train_df.head()

In [8]:
train_df.dtypes

In [9]:
test_df = pd.read_csv(TEST_DATASET_PATH)
test_df.head()

In [10]:
print('Appartments in train:', train_df.shape[0])
print('Appartments in test:', test_df.shape[0])

In [11]:
train_df.shape[1] - 1 == test_df.shape[1]

### Приведение типов

In [12]:
train_df.dtypes

In [13]:
train_df['Id'] = train_df['Id'].astype(str)
train_df['DistrictId'] = train_df['DistrictId'].astype(str)

## 1. EDA 

**Целевая переменная**

In [14]:
plt.figure(figsize = (10, 5))

train_df['Price'].hist(bins=20)
plt.ylabel('Count')
plt.xlabel('Price')

plt.title('Target distribution')
plt.show()

**Количественные переменные**

In [15]:
train_df.select_dtypes(include=['float', 'int']).describe()

**Номинативные переменные**

In [16]:
train_df.select_dtypes(include='object').describe()

In [17]:
print(train_df['DistrictId'].value_counts())
print(train_df['Ecology_2'].value_counts())
print(train_df['Ecology_3'].value_counts())
print(train_df['Shops_2'].value_counts())

**Change categorical to quantitative values**

In [18]:
# Encode the 'A'/'B' flag columns as 0/1 with a single per-column mapping.
binary_codes = {'A': 0, 'B': 1}
train_df.replace({flag_col: binary_codes for flag_col in ('Ecology_2', 'Ecology_3', 'Shops_2')},
                 inplace=True)

### 2. Обработка выбросов

**Rooms**

In [19]:
train_df['Rooms'].value_counts()

In [20]:
# Outliers - 2%: cap Rooms at its 1st and 99th percentiles.

Rooms_min_value, Rooms_max_value = np.quantile(train_df['Rooms'], q=[0.01, 0.99])
print(f'Rooms_max_value = {Rooms_max_value}')
print(f'Rooms_min_value = {Rooms_min_value}')

above_cap = train_df['Rooms'].gt(Rooms_max_value)
below_cap = train_df['Rooms'].lt(Rooms_min_value)

train_df.loc[above_cap, 'Rooms'] = Rooms_max_value
train_df.loc[below_cap, 'Rooms'] = Rooms_min_value

In [21]:
train_df['Rooms'].value_counts()

**KitchenSquare** 

In [22]:
train_df['KitchenSquare'].value_counts()

In [23]:
# Outliers max 1%: cap KitchenSquare at its 99th percentile and floor it at a fixed minimum.

KitchenSquare_min_value = 3     # even small kitchen area should be 3m
KitchenSquare_max_value = np.quantile(train_df['KitchenSquare'], q=0.99)

print(f'KitchenSquare_max_value = {KitchenSquare_max_value}')
print(f'KitchenSquare_min_value = {KitchenSquare_min_value}')

kitchen_too_big = train_df['KitchenSquare'].gt(KitchenSquare_max_value)
train_df.loc[kitchen_too_big, 'KitchenSquare'] = KitchenSquare_max_value

# Missing values are treated like undersized kitchens.
kitchen_too_small = train_df['KitchenSquare'].isna() | train_df['KitchenSquare'].lt(KitchenSquare_min_value)
train_df.loc[kitchen_too_small, 'KitchenSquare'] = KitchenSquare_min_value


In [24]:
train_df['KitchenSquare'].value_counts()

**HouseFloor, Floor**

In [25]:
train_df['HouseFloor'].sort_values().unique()

In [26]:
train_df['Floor'].sort_values().unique()

In [27]:
(train_df['Floor'] > train_df['HouseFloor']).sum()

In [28]:
# Flag rows where the building height is zero or the apartment floor exceeds it.
house_floor_is_zero = train_df['HouseFloor'].eq(0)
floor_above_top = train_df['Floor'].gt(train_df['HouseFloor'])
train_df['HouseFloor_mistake'] = (house_floor_is_zero | floor_above_top).astype('int64')

In [29]:
train_df.loc[(train_df['HouseFloor'].isna()) | (train_df['HouseFloor'] == 0), 'HouseFloor'] = train_df['HouseFloor'].median()

In [30]:
floor_mistake = train_df.loc[train_df['Floor'] > train_df['HouseFloor']].index
floor_mistake

In [31]:
# Replace impossible floors with a random floor between 1 and the building height.
# random.randint needs integer bounds (non-integer arguments raise TypeError on
# Python 3.12+), and HouseFloor can hold floats (e.g. after the median fill above),
# so the bound is cast explicitly. A dedicated seeded generator keeps reruns reproducible.
floor_rng = random.Random(42)
train_df.loc[floor_mistake, 'Floor'] = train_df.loc[floor_mistake, 'HouseFloor']\
                                                .apply(lambda x: floor_rng.randint(1, int(x)))

In [32]:
(train_df['Floor'] > train_df['HouseFloor']).sum()

**HouseYear**

In [33]:
train_df['HouseYear'].sort_values(ascending=False)

In [34]:
train_df[(train_df['HouseYear'] > 2020) | (train_df['HouseYear'] < 1910)]

In [35]:
# NOTE(review): the two rows below are patched by hard-coded index label; presumably
# they are the outliers listed by the previous cell - confirm against the data in use,
# since these labels are only meaningful for one specific version of train.csv.
train_df.loc[8828, 'HouseYear'] = 1968                      # The best option for this time
# Midpoint of 2005 and 2011 (= 2008.0); presumably the raw value encoded a year range - TODO confirm.
train_df.loc[9163, 'HouseYear'] = np.mean([2005, 2011])

# Any year still outside 1910..2020 is replaced by the column median.
train_df.loc[(train_df['HouseYear'] > 2020) | (train_df['HouseYear'] < 1910), 'HouseYear'] = train_df['HouseYear'].median()

### 3. Обработка пропусков  <a class='anchor' id='nan'>

In [36]:
train_df.isna().sum()

In [37]:
train_df[['Square', 'LifeSquare']].head()

**LifeSquare**

In [38]:
train_df['LifeSquare_NaN'] = train_df['LifeSquare'].isna() * 1

condition = (train_df['LifeSquare'].isna()) \
             & (~train_df['Square'].isna()) \
             & (~train_df['KitchenSquare'].isna())
        
train_df.loc[condition, 'LifeSquare'] = train_df.loc[condition, 'Square'] \
                                            - train_df.loc[condition, 'KitchenSquare'] - 5

**Healthcare_1**

In [39]:
train_df.drop('Healthcare_1', axis=1, inplace=True) # too much of NaNs

In [40]:
class DataPreprocessing:
    """Prepare the raw data using statistics learned in ``fit``.

    Every threshold used by ``transform`` is stored on the instance, so the
    class does not depend on notebook-level variables and can be used right
    after a kernel restart.

    Attributes are set in ``__init__`` / ``fit``:
        medians: per-column medians of the numeric columns of the fitted frame.
        kitchen_square_quantile: 0.99 quantile of ``KitchenSquare`` (upper cap).
        rooms_min / rooms_max: 0.01 / 0.99 quantiles of ``Rooms``.
        kitchen_square_min: lower bound applied to ``KitchenSquare``.
    """

    def __init__(self, kitchen_square_min=3, random_state=42):
        """Set up the (not yet fitted) preprocessor.

        Args:
            kitchen_square_min: smallest kitchen area kept as is; smaller or
                missing values are replaced by it.
            random_state: seed of the private generator used to re-draw
                impossible floor numbers; ``None`` gives an unseeded generator.
        """
        self.medians = None
        self.kitchen_square_quantile = None
        self.rooms_min = None
        self.rooms_max = None
        self.kitchen_square_min = kitchen_square_min
        self._rng = random.Random(random_state)

    def fit(self, X):
        """Store the statistics needed by ``transform`` and return ``self``."""
        # numeric_only=True: the raw frame still has string columns, on which
        # DataFrame.median raises in recent pandas versions.
        self.medians = X.median(numeric_only=True)
        self.kitchen_square_quantile = X['KitchenSquare'].quantile(0.99)
        self.rooms_min = X['Rooms'].quantile(0.01)
        self.rooms_max = X['Rooms'].quantile(0.99)
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if self.medians is None:
            raise RuntimeError('DataPreprocessing.fit must be called before transform')

        X = X.copy()

        # Binary categorical flags -> 0/1 (skipped when already numeric, so
        # transforming an already encoded frame does not wipe the values).
        for col in ('Ecology_2', 'Ecology_3', 'Shops_2'):
            if col in X.columns and not pd.api.types.is_numeric_dtype(X[col]):
                X[col] = X[col].map({'A': 0, 'B': 1})

        # Rooms
        X.loc[X['Rooms'] == 0, 'Rooms'] = self.rooms_min
        X.loc[X['Rooms'] >= 5, 'Rooms'] = self.rooms_max

        # KitchenSquare: cap from above, then lift missing / tiny values.
        X.loc[X['KitchenSquare'] > self.kitchen_square_quantile, 'KitchenSquare'] = self.kitchen_square_quantile
        too_small = X['KitchenSquare'].isna() | (X['KitchenSquare'] < self.kitchen_square_min)
        X.loc[too_small, 'KitchenSquare'] = self.kitchen_square_min

        # HouseFloor, Floor: flag the inconsistent rows before repairing them.
        X['HouseFloor_mistake'] = ((X['HouseFloor'] == 0) | (X['Floor'] > X['HouseFloor'])).astype('int64')
        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']

        floor_mistake = X['Floor'] > X['HouseFloor']
        if floor_mistake.any():
            # randint needs integer bounds, while HouseFloor may be stored as float.
            X.loc[floor_mistake, 'Floor'] = X.loc[floor_mistake, 'HouseFloor'] \
                .apply(lambda top: self._rng.randint(1, int(top)))

        # HouseYear: implausible years get the fitted median, not a statistic
        # of the frame being transformed.
        bad_year = (X['HouseYear'] > 2020) | (X['HouseYear'] < 1910)
        if bad_year.any():
            X.loc[bad_year, 'HouseYear'] = int(round(self.medians['HouseYear']))

        # Healthcare_1 is dropped entirely.
        X = X.drop(columns='Healthcare_1', errors='ignore')

        # LifeSquare: remember which rows were missing, then estimate them as
        # Square - KitchenSquare - 5 (the constant 5 is kept from the original
        # heuristic; presumably an allowance for non-living area - confirm).
        X['LifeSquare_nan'] = X['LifeSquare'].isna().astype('int64')
        missing_life = X['LifeSquare'].isna() & X['Square'].notna() & X['KitchenSquare'].notna()
        X.loc[missing_life, 'LifeSquare'] = (
            X.loc[missing_life, 'Square'] - X.loc[missing_life, 'KitchenSquare'] - 5
        )

        # Whatever is still missing is filled with the fitted medians.
        X = X.fillna(self.medians)

        return X

### 4. Построение новых признаков  <a class='anchor' id='feature'>

**Average room area in an apartment**

In [41]:
train_df['room_square'] = train_df['LifeSquare'] / train_df['Rooms']
train_df[['Rooms', 'room_square']].describe().T

**HouseAge**

In [42]:
# `datetime` is the class imported in the imports cell at the top of the notebook;
# re-importing the module here would rebind that name from the class to the module.
now = datetime.now()

train_df['housing_median_age'] = now.year - train_df['HouseYear']

In [43]:
train_df['housing_median_age'].describe()

**Housing age category**

In [44]:
train_df['age_cat'] = 0

train_df.loc[(train_df['housing_median_age'] <= 5), 'age_cat'] = 1
train_df.loc[(train_df['housing_median_age'] > 5) & (train_df['housing_median_age'] <= 10), 'age_cat'] = 2    # new
train_df.loc[(train_df['housing_median_age'] > 10) & (train_df['housing_median_age'] <= 25), 'age_cat'] = 3   # relatively new
train_df.loc[(train_df['housing_median_age'] > 25) & (train_df['housing_median_age'] <= 35), 'age_cat'] = 4   # 90s
train_df.loc[(train_df['housing_median_age'] > 35) & (train_df['housing_median_age'] <= 65), 'age_cat'] = 5   # probably panel houses and Khrushchev-era blocks
# The last band starts right after 65 so that ages in (65, 75] are not left at 0.
train_df.loc[(train_df['housing_median_age'] > 65), 'age_cat'] = 6   # historical

train_df.head().T

**Apartment floor category**

In [45]:
# Bucket the apartment floor into five bands; anything unmatched keeps the default 0.
train_df['floor_cat'] = 0

floor_bands = [
    train_df['Floor'].le(3),
    train_df['Floor'].gt(3) & train_df['Floor'].le(5),
    train_df['Floor'].gt(5) & train_df['Floor'].le(10),
    train_df['Floor'].gt(10) & train_df['Floor'].le(20),
    train_df['Floor'].gt(20),
]
for band_code, in_band in enumerate(floor_bands, start=1):
    train_df.loc[in_band, 'floor_cat'] = band_code

**Functions for new features**

In [46]:
def room_square(X):
    """Add ``room_square`` (``LifeSquare`` divided by ``Rooms``) to ``X`` in place and return ``X``."""
    living_area, room_count = X['LifeSquare'], X['Rooms']
    X['room_square'] = living_area.div(room_count)

    return X

def housing_age(X, current_year=None):
    """Add ``housing_median_age`` (reference year minus ``HouseYear``) to ``X``.

    Args:
        X: frame with a ``HouseYear`` column; modified in place.
        current_year: year the age is measured from. Defaults to the current
            calendar year, so the function does not rely on a notebook-level
            ``now`` variable. Pass a fixed year for reproducible features.

    Returns:
        The same frame ``X`` with the new column.
    """
    if current_year is None:
        # Local import: keeps the function usable regardless of what the name
        # `datetime` is bound to at notebook level.
        from datetime import date
        current_year = date.today().year

    X['housing_median_age'] = current_year - X['HouseYear']

    return X
    
def age_to_cat(X):
    """Add ``age_cat``, a band code derived from ``housing_median_age``, to ``X``.

    Bands (right-inclusive): <=5 -> 1, (5, 10] -> 2, (10, 25] -> 3,
    (25, 35] -> 4, (35, 65] -> 5, >65 -> 6. Missing ages get 0.

    The last band starts right after 65, so ages in (65, 75] no longer fall
    between two bands and end up with code 0.

    Returns:
        The same frame ``X`` (modified in place) with the new integer column.
    """
    band_edges = [-np.inf, 5, 10, 25, 35, 65, np.inf]

    # labels=False yields 0-based band indices (NaN where the age is missing).
    band_index = pd.cut(X['housing_median_age'], bins=band_edges, labels=False)
    X['age_cat'] = band_index.add(1).fillna(0).astype('int64')

    return X
    

def floor_to_cat(X):
    """Add ``floor_cat``, a band code derived from ``Floor``, to ``X`` in place.

    Bands: <=3 -> 1, (3, 5] -> 2, (5, 10] -> 3, (10, 20] -> 4, >20 -> 5;
    rows matching no band keep 0. Returns the same frame ``X``.
    """
    level = X['Floor']
    in_band = [
        level.le(3).to_numpy(),
        (level.gt(3) & level.le(5)).to_numpy(),
        (level.gt(5) & level.le(10)).to_numpy(),
        (level.gt(10) & level.le(20)).to_numpy(),
        level.gt(20).to_numpy(),
    ]
    X['floor_cat'] = np.select(in_band, [1, 2, 3, 4, 5], default=0).astype('int64')

    return X

### 5. Отбор признаков  <a class='anchor' id='feature_selection'>

In [47]:
train_df.columns

In [48]:
feature_names = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',
                 'Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3',
                 'Helthcare_2', 'Shops_1', 'Shops_2']

new_feature_names = ['room_square', 'housing_median_age', 'age_cat', 'floor_cat']

target_name = 'Price'

### 6. Разбиение на train и test  <a class='anchor' id='split'>

In [49]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)
test_df = pd.read_csv(TEST_DATASET_PATH)

X = train_df.drop(columns=target_name)
y = train_df[target_name]

In [50]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

In [51]:
preprocessor = DataPreprocessing()
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
X_valid = preprocessor.transform(X_valid)
test_df = preprocessor.transform(test_df)

X_train.shape, X_valid.shape, test_df.shape

In [52]:
# Feature generation: run the same steps, in the same order, on every split.
feature_steps = (room_square, housing_age, age_to_cat, floor_to_cat)

for add_feature in feature_steps:
    X_train = add_feature(X_train)
    X_valid = add_feature(X_valid)
    test_df = add_feature(test_df)

X_train.shape, X_valid.shape, test_df.shape

In [53]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * signed / unsigned NumPy integer columns -> smallest integer type of the
      same signedness that holds the column's min and max (bounds inclusive);
    * float64 columns -> float32 when min and max lie strictly inside the
      float32 range (precision may be reduced);
    * object / string columns -> ``category``;
    * every other dtype (bool, category, datetime, narrower floats,
      extension dtypes) is left unchanged, so the function can safely be
      called again on its own output.

    Memory usage before and after, and the relative decrease, are printed.

    Returns:
        The same frame ``df``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if isinstance(col_type, pd.CategoricalDtype):
            # Already compact; min()/max() would also fail on unordered categories.
            continue

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        if not isinstance(col_type, np.dtype):
            # Extension dtypes (nullable ints, etc.) are not handled here.
            continue

        if col_type.kind in int_candidates:
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            # All-NaN columns fail both comparisons and stay float64.
            if c_min > float32_info.min and c_max < float32_info.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [54]:
X_train = reduce_mem_usage(X_train)
X_valid = reduce_mem_usage(X_valid)
test_df = reduce_mem_usage(test_df)

In [55]:
X_train = X_train[feature_names + new_feature_names]
X_valid = X_valid[feature_names + new_feature_names]
test_df = test_df[feature_names + new_feature_names]

In [56]:
X_train.head()

In [57]:
X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()

### 7. Построение модели  <a class='anchor' id='modeling'>

**Обучение**

In [70]:
# The squared-error split criterion is spelled 'squared_error' in current
# scikit-learn; the old alias 'mse' was deprecated and later removed.
gb_model = GradientBoostingRegressor(criterion='squared_error',
                                     max_depth=5,
                                     min_samples_leaf=100,
                                     random_state=42,
                                     n_estimators=400)

gb_model.fit(X_train, y_train)

# Predictions on the training split and on the hold-out (validation) split.
y_train_preds = gb_model.predict(X_train)
y_test_preds = gb_model.predict(X_valid)

print(r2_score(y_train, y_train_preds))
print(r2_score(y_valid, y_test_preds))

**Оценка модели**

In [59]:
y_train_preds = gb_model.predict(X_train)
y_test_preds = gb_model.predict(X_valid)

evaluate_preds(y_train, y_train_preds, y_valid, y_test_preds)

**Кросс-валидация**

In [60]:
cv_score = cross_val_score(gb_model, X_train, y_train, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=21))
cv_score

In [61]:
cv_score.mean()

**Важность признаков**

In [62]:
feature_importances = pd.DataFrame(zip(X_train.columns, gb_model.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False)

Идея более сложных моделей:

catboost, lightgbm, xgboost

### 8. Прогнозирование на тестовом датасете  <a class='anchor' id='prediction'>

1. Выполнить для тестового датасета те же этапы обработки и построения признаков
2. Не потерять и не перемешать индексы примеров при построении прогнозов
3. Прогнозы должны быть для всех примеров из тестового датасета (для всех строк)

In [63]:
test_df.shape

In [64]:
test_df

In [65]:
submit = pd.read_csv('/kaggle/input/real-estate-price-prediction-moscow/sample_submission.csv')
submit.head()

In [66]:
predictions = gb_model.predict(test_df)
predictions

In [67]:
submit['Price'] = predictions
submit.head()

In [68]:
submit.to_csv('gb_submit.csv', index=False)