# IMPORT

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)

from ipywidgets import widgets

from pandas_profiling import ProfileReport

# PREPARATION

In [4]:
# Load the cleaned train and test sets. Both floor columns are read as str
# so that their values stay textual instead of being parsed as numbers.
train = pd.read_csv(
    '../data/train_clean.csv',
    dtype=dict(floors=str, total_floors=str),
)
test = pd.read_csv(
    '../data/test_clean.csv',
    dtype=dict(floors=str, total_floors=str),
)

In [5]:
# Stack train and test into a single frame and renumber the rows from 0.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0; the result is the same as train.append(test).
df = pd.concat([train, test], sort=False).reset_index(drop=True)

**Дропаем не интересующие нас признаки.**

In [6]:
# Drop the columns that are not needed for the analysis.
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# the cell be re-run safely: a second run no longer raises KeyError because
# 'page' has already been removed.
df = df.drop(columns=['page'], errors='ignore')

# all data

Создаем отчет обо всех данных (тренировочные и тестовые), без предварительной фильтрации.

In [8]:
# Build the profiling report for the combined train + test frame (no row filtering).
profile = ProfileReport(
    df,
    title="Flats without filters",
)

In [9]:
# Render the report inline in the notebook (the call is left commented out).
# profile.to_widgets()

In [10]:
# Save the report as an HTML file.
profile.to_file(output_file="EDA_FLATS_PANDAS_PROFILING_REPORT.html")

Summarize dataset:   0%|          | 0/40 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# < 400 m2

In [28]:
# Keep rows with square below 401 and price below 100,000,000, then renumber from 0.
# Rows where either value is missing fail the comparison and are excluded.
# NOTE(review): if the test rows carry no price, this filter removes them — confirm that is intended.
df_400m2 = df.loc[
    (df['square'] < 401) & (df['price'] < 100_000_000)
].reset_index(drop=True)

In [29]:
# Build the profiling report for the subset limited by the 401 square / 100,000,000 price filter.
profile_400m2 = ProfileReport(
    df_400m2,
    title="Flats < 400m2",
)

In [30]:
# Render the report inline in the notebook (the call is left commented out).
# profile_400m2.to_widgets()

In [31]:
# Сохраняем рузльтат в HTML.
# Save the report as an HTML file.
profile_400m2.to_file(output_file="EDA_FLATS_400M2_PANDAS_PROFILING_REPORT.html")

Summarize dataset:   0%|          | 0/40 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# < 130 m2

In [32]:
# Keep rows with square below 131 and price below 100,000,000, then renumber from 0.
# Rows where either value is missing fail the comparison and are excluded.
df_130m2 = df.loc[
    (df['square'] < 131) & (df['price'] < 100_000_000)
].reset_index(drop=True)

In [34]:
# Build the profiling report for the subset limited by the 131 square / 100,000,000 price filter.
profile_130m2 = ProfileReport(
    df_130m2,
    title="Flats < 130m2",
)

In [35]:
# Render the report inline in the notebook (the call is left commented out).
# profile_130m2.to_widgets()

In [36]:
# Сохраняем рузльтат в HTML.
# Save the report as an HTML file.
profile_130m2.to_file(output_file="EDA_FLATS_130M2_PANDAS_PROFILING_REPORT.html")

Summarize dataset:   0%|          | 0/40 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Optimized categorical features

In [1]:
# Names of the columns treated as categorical features.
cat_features = [
    'flat_type',
    'object_type',
    'rooms',
    'floors',
    'build_matireal',
    'district',
    'total_floors',
    'metro_station',
    'underground',
]

In [7]:
# Remove every row whose categorical value appears in the exclusion table below.
# NOTE(review): presumably these are sparse/outlier categories picked from the
# profiling reports — confirm the selection criteria.
# Floor values are compared as strings because 'floors' and 'total_floors'
# are read with dtype=str.
excluded_values = {
    # flat_type - free layout.
    'flat_type': ['Своб. планировка'],
    # rooms - free layout, 7, 8, 9.
    'rooms': ['Своб. планировка', '7', '8', '9'],
    # floors - 26 through 35.
    'floors': [str(floor) for floor in range(26, 36)],
    # build_matireal - blocks, wood.
    'build_matireal': ['Блоки', 'Дерево'],
    # total_floors - 1 and 30 through 36.
    'total_floors': ['1'] + [str(floor) for floor in range(30, 37)],
    # metro_station - '5 км', 'Пискарёвка'.
    'metro_station': ['5 км', 'Пискарёвка'],
}

# A row is excluded as soon as any one column matches, which gives the same
# result as dropping column by column. Missing values never match.
is_excluded = pd.Series(False, index=df.index)
for column, values in excluded_values.items():
    is_excluded |= df[column].isin(values)

# Original index labels are kept (no reset), and the copy leaves df untouched.
df_correct = df[~is_excluded].copy()

## < 400 m2

In [8]:
# From the category-filtered frame, keep rows with square below 401 and
# price below 100,000,000, then renumber from 0.
# Rows where either value is missing fail the comparison and are excluded.
df_correct_400m2 = df_correct.loc[
    (df_correct['square'] < 401) & (df_correct['price'] < 100_000_000)
].reset_index(drop=True)

In [9]:
# Build the profiling report for the category-filtered subset (401 square / 100,000,000 price filter).
profile_cat_correct_400m2 = ProfileReport(
    df_correct_400m2,
    title="Correct categorical flats < 400m2",
)

In [10]:
# Save the report as an HTML file.
profile_cat_correct_400m2.to_file(
    output_file="EDA_CAT_COR_FLATS_400M2_PANDAS_PROFILING_REPORT.html"
)

Summarize dataset:   0%|          | 0/40 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## < 130 m2

In [11]:
# From the category-filtered frame, keep rows with square below 131 and
# price below 100,000,000, then renumber from 0.
# Rows where either value is missing fail the comparison and are excluded.
df_correct_130m2 = df_correct.loc[
    (df_correct['square'] < 131) & (df_correct['price'] < 100_000_000)
].reset_index(drop=True)

In [12]:
# Build the profiling report for the category-filtered subset (131 square / 100,000,000 price filter).
profile_cat_correct_130m2 = ProfileReport(
    df_correct_130m2,
    title="Correct categorical flats < 130m2",
)

In [13]:
# Save the report as an HTML file.
profile_cat_correct_130m2.to_file(
    output_file="EDA_CAT_COR_FLATS_130M2_PANDAS_PROFILING_REPORT.html"
)

Summarize dataset:   0%|          | 0/40 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]