<a href="https://colab.research.google.com/github/asya474/final_project/blob/main/final_project_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Основные цели и задачи проекта

Основной целью проекта является создание сервиса для предсказания стоимости домов, используя накопленные данные о сделках. Используемый алгоритм должен быть достаточно быстрым и эффективным с точки зрения выбранной метрики.
Выбрана метрика MAPE для поставленной задачи регрессии, так как в данных в основном представлены категориальные признаки, в поставленной задаче отсутствует необходимость применять сильно большие штрафы к модели за неправильно обработанные данные и так же относительная метрика более проста и удобна для оценки качества модели, что и пригодится нам для построения безлайна для соответствующего сервиса.

Краткая информация о данных

Датасет, используемый в проекте, представляет из себя обычные табличные данные, и состоит из следующих столбцов: • status статус сделки • private pool наличие частного бассейна • propertyType тип собственности с информацией о количестве комнат и стиле интерьера • street адрес • baths количество бассейнов • homeFacts информация о здании • fireplace наличие камина • city город • schools информация о школах в округе • sqft площадь жилья в квадратных футах • zipcode почтовый индекс • beds количество кроватей и площадь спальни • state штат • stories подсобные помещения • mls-id мультилистинг • PrivatePool наличие частного бассейна • MlsId мультилистинг • target стоимость жилья

Этапы работы над проектом

1. Знакомство с датасетом 
2.Выбор метрики, которая отвечала бы бизнес-целям. 
3.Очистка данных, заполнение пропусков 
4.Стандартизация числовых столбцов, применением методов label-encoding, get_dummies к категориальным переменным.
5.Выбор алгоритмов машинного обучения,Я оптимизация гиперпараметров, выбор лучшей модели

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import zipfile
import time
import csv
import sys
import os
!pip install tensorflow_addons -q
import tensorflow_addons as tfa
import re

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import *
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import *
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
import PIL
from PIL import ImageOps, ImageFilter
#увеличим дефолтный размер графиков
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5
#графики в svg выглядят более четкими
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
import seaborn as sns
#print(os.listdir("/content/drive/MyDrive/Colab Notebooks/input"))


In [2]:
RANDOM_SEED = 42
VAL_SIZE = 0.20 
mape=mean_absolute_percentage_error #в качестве метрики выбрано МАРЕ, 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## EDA

In [4]:
zip_file='/content/drive/MyDrive/data.csv.zip'
z=zipfile.ZipFile(zip_file, 'r')
z.extractall()
print(os.listdir())

['.config', '__MACOSX', 'data.csv', 'drive', 'sample_data']


In [5]:
data=pd.read_csv('data.csv') #загрузка датасета

In [6]:
data.info() #вывод основной информации о датасете

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 377185 entries, 0 to 377184
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   status        337267 non-null  object
 1   private pool  4181 non-null    object
 2   propertyType  342452 non-null  object
 3   street        377183 non-null  object
 4   baths         270847 non-null  object
 5   homeFacts     377185 non-null  object
 6   fireplace     103115 non-null  object
 7   city          377151 non-null  object
 8   schools       377185 non-null  object
 9   sqft          336608 non-null  object
 10  zipcode       377185 non-null  object
 11  beds          285903 non-null  object
 12  state         377185 non-null  object
 13  stories       226470 non-null  object
 14  mls-id        24942 non-null   object
 15  PrivatePool   40311 non-null   object
 16  MlsId         310305 non-null  object
 17  target        374704 non-null  object
dtypes: object(18)
memory usa

In [7]:
data.nunique() #вывод уникальных значений

status             159
private pool         1
propertyType      1280
street          337076
baths              229
homeFacts       321009
fireplace         1653
city              2026
schools         297365
sqft             25405
zipcode           4549
beds              1184
state               39
stories            348
mls-id           24907
PrivatePool          2
MlsId           232944
target           43939
dtype: int64

In [8]:
data.isna().sum() #проверка на содержание NaN в столбце

status           39918
private pool    373004
propertyType     34733
street               2
baths           106338
homeFacts            0
fireplace       274070
city                34
schools              0
sqft             40577
zipcode              0
beds             91282
state                0
stories         150715
mls-id          352243
PrivatePool     336874
MlsId            66880
target            2481
dtype: int64

### target

In [9]:
data.target.value_counts() #обзор значений в столбце 

$225,000     1462
$275,000     1355
$250,000     1312
$350,000     1296
$299,900     1276
             ... 
$390,359        1
$458,900+       1
274,359         1
$273,490+       1
$171,306        1
Name: target, Length: 43939, dtype: int64

In [10]:
data['new_target'] = data['target'].str.replace('+', '') #удаляем + из значений столбца

  """Entry point for launching an IPython kernel.


In [11]:
data['new_target'] = data['new_target'].str.replace('$', '') #удаляем $ из значений столбца

  """Entry point for launching an IPython kernel.


In [12]:
data['new_target'] = data['new_target'].str.replace(',', '') #удаляем , из значений столбца

In [13]:
data['new_target']=pd.to_numeric(data['new_target'], downcast='integer', errors='coerce') #приводим значения к типу integer

In [14]:
data['new_target'].value_counts() #обзор значений в столбце

225000.0    1806
275000.0    1650
250000.0    1644
350000.0    1641
325000.0    1562
            ... 
218111.0       1
136466.0       1
236070.0       1
752911.0       1
171306.0       1
Name: new_target, Length: 34184, dtype: int64

In [15]:
data.drop(['target',], axis=1, inplace=True) #удаляем старый столбец с таргетом и грязными данными, будем в дальнейшем использовать новый

In [16]:
data.dropna(subset=['new_target'], inplace=True) #удаляем строки, содержащие NaN

### status

In [17]:
data.status.value_counts() #краткий обзор значений

for sale                   156100
Active                     105206
For sale                    43465
foreclosure                  5677
New construction             5459
                            ...  
Coming soon: Nov 8.             1
Coming soon: Oct 29.            1
Coming soon: Dec 15.            1
Pending W/Backup Wanted         1
Coming soon: Dec 23.            1
Name: status, Length: 155, dtype: int64

In [18]:
data.status.unique() #вывод уникальных значений, как видим из разнообразия представленных типов можно вычленить более унифицированные. Попробуем вычленить основные типы с помощью дальнейших преобразований

array(['Active', 'for sale', nan, 'New construction', 'New', 'For sale',
       'Pending', 'P', 'Active/Contingent', ' / auction',
       'Under Contract', 'Pre-foreclosure / auction',
       'Under Contract   Showing', 'Pre-foreclosure',
       'Under Contract Backups', 'Active Under Contract', 'foreclosure',
       'Foreclosed', 'Option Pending', 'Under Contract Show', 'Auction',
       'A Active', 'Contingent', 'Pending   Continue To Show',
       'Price Change', 'Back on Market', 'Active Option', 'Foreclosure',
       'Coming soon: Nov 21.', 'Contingent Finance And Inspection',
       'Coming soon: Dec 4.', 'P Pending Sale', 'Coming soon: Nov 23.',
       'Active With Contingencies', 'Pending Ab', 'Pf', 'Contingent Show',
       'Contract P', 'Contingent Take Backup', 'Apartment for rent',
       'Backup Contract', 'Option Contract', 'Pending Continue To Show',
       'pending', 'Pending Inspection', 'Active Option Contract', 'C',
       'Auction - Active', 'Contingent   Show', 'Pi

In [19]:
data['status'] = data.status.astype(str) #приводим к типу строка, понижаем регистр и берем первое значение строки 
data['status'] = data.status.apply(lambda x: x.split(' ')[0].lower())

In [20]:
data.status.unique() #обзор получившихся уникальных значений

array(['active', 'for', 'nan', 'new', 'pending', 'p', 'active/contingent',
       '', 'under', 'pre-foreclosure', 'foreclosure', 'foreclosed',
       'option', 'auction', 'a', 'contingent', 'price', 'back', 'coming',
       'pf', 'contract', 'apartment', 'backup', 'c', 'pi', 'due', 'u',
       'lease/purchase', 'offer', 'listing', 'pending,', 'contingency',
       'condo', 'ct', 'temporary', 'closed', 'accepted', 'conditional',
       'accepting', 'conting', 'ps', 're', 'reactivated', 'uc'],
      dtype=object)

In [21]:
def transform_status(
    value: str
):  
    if isinstance(value, str):
        if  value in ['for']:
            return 'for sale'
        if  value in ['nan', '']:
            return 'not known'
        if  value in ['a', 'active/contingent']:
            return 'active'
        if  value in ['price']:
            return 'price change'
        if  value in ['back', 'backup']:
            return 'back on market'
        if  value in ['coming', 'c']:
            return 'coming soon'
        if value in ['p', 'pi', 'ps']:
            return 'pending'
        if value in ['u', 'uc']:
            return 'under'
        if value in ['re']:
            return 'reactivated'
        if value in ['ct']:
            return 'contract'
        if value in ['pre-foreclosure', 'foreclosure', 'foreclosed']:
            return 'foreclosure'
        if value in ['accepting', 'accepted']:
            return 'accepted'
        if value in ['contingency', 'conting']:
            return 'contingency'
        if value in ['condo', 'conditional']:
            return 'conditional'

    return value
data.status=data.status.apply(transform_status) #функция для преобразования, так как видно,что некоторые аббревиатуры используются для более полных категорий,например

In [22]:
data.status.unique() #обзор получившихся в итоге уникальных значений

array(['active', 'for sale', 'not known', 'new', 'pending', 'under',
       'foreclosure', 'option', 'auction', 'contingent', 'price change',
       'back on market', 'coming soon', 'pf', 'contract', 'apartment',
       'due', 'lease/purchase', 'offer', 'listing', 'pending,',
       'contingency', 'conditional', 'temporary', 'closed', 'accepted',
       'reactivated'], dtype=object)

### private pool

In [23]:
data['private pool'].value_counts() #обзор значений в столбце

Yes    4151
Name: private pool, dtype: int64

In [24]:
data.drop(['private pool',], axis=1, inplace=True) #удаляем столбец,так как значения совпадает со столбцом PrivatePool

### propertyType

In [25]:
data['propertyType'].value_counts() #обзор значений в столбце

single-family home                                             91159
Single Family                                                  61888
Single Family Home                                             31725
condo                                                          25878
lot/land                                                       20532
                                                               ...  
1 Story, Contemporary, Other (See Remarks)                         1
Custom, Elevated, Other                                            1
Contemporary, Farmhouse                                            1
2 Stories, Traditional, Mediterranean, Texas Hill Country          1
Bilevel, Converted Dwelling, Loft with Bedrooms, Condo/Unit        1
Name: propertyType, Length: 1279, dtype: int64

In [26]:
data['propertyType'].unique() #обзор уникальных значений

array(['Single Family Home', 'single-family home', 'lot/land', ...,
       'Cabin/Lodge, Contemporary, Converted Barn, Cottage, Loft with Bedrooms, Log Home, Post & Beam',
       'Lake House, Single Detached, Tudor',
       'Bilevel, Converted Dwelling, Loft with Bedrooms, Condo/Unit'],
      dtype=object)

In [27]:
data['propertyType'] = data['propertyType'].astype(str) #преобразование значений в строках
data['propertyType'] = data['propertyType'].apply(lambda x: x.split(' ')[0].lower().replace(',', ''))

In [28]:
data['propertyType'].unique()
#требуется более точечная ифна с этим сериес,чем просто обрезка данных, тут и данные о стиле интерьера, и данные о колве комнат

array(['single', 'single-family', 'lot/land', 'townhouse', 'florida',
       'nan', 'coop', 'english', '2', 'multi-family', 'penthouse',
       'condo', 'land', 'condo/townhome/row', '', 'detached', '1',
       'other', 'colonial', 'transitional', 'high', 'mobile/manufactured',
       'tri-level', 'craftsman', 'federal', 'multi', 'traditional',
       'custom', 'cooperative', 'contemporary/modern', 'cape', 'mobile',
       'miscellaneous', 'mfd/mobile', 'bungalow', 'spanish/mediterranean',
       'contemporary', 'multi-level', 'condo/unit', 'ranch', 'low-rise',
       'rancher', 'urban', 'two', 'garden', 'farms/ranches', 'a-frame',
       'attached', 'one', 'georgian', 'dwelling', 'victorian',
       'apartment', 'cluster', 'manufactured', 'condominium',
       'condo/townhome', 'hi', 'mediterranean', 'cabin', 'bermuda',
       'split', 'farm', 'split-level', 'singlefamilyresidence', 'log',
       'bilevel', 'mountain', 'commercial/industrial', 'cottage',
       'high-rise', 'mid-rise'

In [29]:
def transform_status(
    value: str
):  
    if isinstance(value, str):
        if  value in ['nan', '']:
            return 'not known'
        if  value in ['1']:
            return 'one'
        if  value in ['2', '2.5', '2-story']:
            return 'two'
        if  value in ['3']:
            return 'back on market'
        if  value in ['coming', 'c']:
            return 'coming soon'
        if value in ['p', 'pi', 'ps']:
            return 'pending'
        if value in ['u', 'uc']:
            return 'under'
        if value in ['re']:
            return 'reactivated'
        if value in ['ct']:
            return 'contract'
        if value in ['pre-foreclosure', 'foreclosure', 'foreclosed']:
            return 'foreclosure'
        if value in ['accepting', 'accepted']:
            return 'accepted'
        if value in ['contingency', 'conting']:
            return 'contingency'
        if value in ['condo', 'conditional']:
            return 'conditional'

    return value
data.status=data.status.apply(transform_status) #функция для снижения вариативности представленных значений, так как некоторые повторяются

### street

In [30]:
data.street.value_counts() #обзор значений в столбце

Address Not Disclosed               672
Undisclosed Address                 516
(undisclosed Address)               391
Address Not Available               175
Unknown Address                      72
                                   ... 
Lynmar Plan in Rough Hollow           1
MLS #: CORC5922640                    1
7784 Rosedale St                      1
Buildable plan: Residence 2 Plan      1
5983 Midcrown Dr                      1
Name: street, Length: 334447, dtype: int64

In [31]:
data['street']=data['street'].fillna(value='not known') #заполнение отсутствующих значений

### baths

In [32]:
data.baths.value_counts() #обзор значений в столбце

2 Baths          51953
3 Baths          35356
2                20429
2.0              16355
4 Baths          14712
                 ...  
32                   1
5.25 Baths           1
41.0                 1
Bathrooms: 21        1
44.0                 1
Name: baths, Length: 225, dtype: int64

In [33]:
data['baths']=pd.to_numeric(data['baths'],  errors='coerce') #приведение к числовому типу

In [34]:
data['baths']=data['baths'].fillna(value=data['baths'].mean()) #заполнение отсутствующих значений

In [35]:
data['baths']=data['baths'].astype(float).apply(lambda x:round(x, 1)) #приведение к типу float

In [36]:
data['baths'].unique() #обзор уникальных значений в столбце

array([  3.5,   4.2,   2. ,   3. ,   4. ,   1. ,   4.5,   5. ,   1.5,
         2.5,   8. ,   0. ,   7. ,   6. ,   5.5, 750. ,  10. ,  18. ,
         9. ,  12. ,   6.5,  14. ,  17. ,  16. ,   7.5,  11. ,  22. ,
        13. ,  32. ,  26. ,   9.5,   8.5,  35. ,  40. ,  29. ,  15. ,
        21. ,  20. ,  55. ,  41. ,  10.5,  27. ,  12.5,  24. ,  23. ,
        34. ,  64. ,  30. ,  44. ])

### homeFacts

In [37]:
data.homeFacts.value_counts() #обзор значений в столбце

{'atAGlanceFacts': [{'factValue': '', 'factLabel': 'Year built'}, {'factValue': '', 'factLabel': 'Remodeled year'}, {'factValue': '', 'factLabel': 'Heating'}, {'factValue': '', 'factLabel': 'Cooling'}, {'factValue': '', 'factLabel': 'Parking'}, {'factValue': '—', 'factLabel': 'lotsize'}, {'factValue': None, 'factLabel': 'Price/sqft'}]}                                                     7142
{'atAGlanceFacts': [{'factValue': None, 'factLabel': 'Year built'}, {'factValue': None, 'factLabel': 'Remodeled year'}, {'factValue': None, 'factLabel': 'Heating'}, {'factValue': None, 'factLabel': 'Cooling'}, {'factValue': None, 'factLabel': 'Parking'}, {'factValue': None, 'factLabel': 'lotsize'}, {'factValue': None, 'factLabel': 'Price/sqft'}]}                                          3484
{'atAGlanceFacts': [{'factValue': '', 'factLabel': 'Year built'}, {'factValue': '', 'factLabel': 'Remodeled year'}, {'factValue': '', 'factLabel': 'Heating'}, {'factValue': '', 'factLabel': 'Cooling'}, {'factVa

In [38]:
#q = list()
#for cur_dict in eval(q):
#   for key in eval(cur_dict):
#    for item in cur_dict[key]['atAGlanceFacts']:
#      if item['factValue'] not in ['', '-', None]:
#          new_q.append(item)

### fireplace

In [39]:
data['fireplace'].value_counts() #обзор значений в столбце

yes                                                                     49925
Yes                                                                     20619
1                                                                       14533
2                                                                        2432
Not Applicable                                                           1990
                                                                        ...  
Gas, Wood Burning, Two, Propane Logs Convey                                 1
Free-standing, Insert, Wood                                                 1
Wood Burning, Attached Fireplace Doors/Screen, Electric, Gas Starter        1
One, Living Room                                                            1
Ceiling Fan, SMAPL, Utility Connection, Walk-In Closets                     1
Name: fireplace, Length: 1652, dtype: int64

In [40]:
def transform_fireplace(
    value: str
):  
    if isinstance(value, str):
        if  value in value:
          return 'yes'
data['fireplace']=data['fireplace'].apply(transform_fireplace) #преобразование данных в столбце

In [41]:
data['fireplace']=data['fireplace'].fillna(value='not') #заполнение отсутствующих значений

In [42]:
data['fireplace'].value_counts() #обзор значений в столбце

not    271878
yes    102428
Name: fireplace, dtype: int64

### city

In [43]:
data['city'].value_counts() #обзор значений в столбце

Houston            24388
San Antonio        15496
Miami              15385
Jacksonville        9890
Dallas              8750
                   ...  
Los Altos Hills        1
Lake worth             1
Lisle                  1
Arrington              1
Blue Springs           1
Name: city, Length: 2019, dtype: int64

In [44]:
data['city']=data['city'].fillna(value='Houston')#заполнение отсутствующих значений наиболее часто встречаемым значением

### schools

In [45]:
data.schools.value_counts() #обзор значений в столбце

[{'rating': [], 'data': {'Distance': [], 'Grades': []}, 'name': []}]                                                                                                                                                                                                                                                                                       4169
[{'rating': ['4/10', '5/10', '6/10'], 'data': {'Distance': ['39.69mi', '39.69mi', '39.69mi'], 'Grades': ['9-12', '6-8', 'PK-5']}, 'name': ['Fort Hancock High School', 'Fort Hancock Middle School', 'Benito Martinez Elementary School']}]                                                                                                                 222
[{'rating': ['4/10', '6/10', '3/10'], 'data': {'Distance': ['3.62mi', '3.62mi', '3.62mi'], 'Grades': ['6-8', 'PK-5', '9-12']}, 'name': ['Horizon Middle School', 'Desert Hills Elementary School', 'Horizon High School']}]                                                                             

### sqft

In [46]:
data['sqft'].value_counts() #обзор значений в столбце

0                                          11853
1,200 sqft                                   824
1,000 sqft                                   643
1,100 sqft                                   566
1,800 sqft                                   558
                                           ...  
101,415 sqft                                   1
3938                                           1
Total interior livable area: 4,580 sqft        1
32,552 sqft                                    1
Total interior livable area: 4,615 sqft        1
Name: sqft, Length: 25369, dtype: int64

In [47]:
data['sqft'].unique() #обзор уникальных значений

array(['2900', '1,947 sqft', '3,000 sqft', ..., '4371', '13,870 sqft',
       'Total interior livable area: 4,615 sqft'], dtype=object)

In [48]:
data['sqft'] = pd.to_numeric(data['sqft'], errors='coerce') #приведение к числовому типу

In [49]:
data['sqft'].value_counts() #обзор значений в столбце

0.0       11853
960.0       284
800.0       257
900.0       234
850.0       177
          ...  
5031.0        1
4325.0        1
4974.0        1
469.0         1
4371.0        1
Name: sqft, Length: 5427, dtype: int64

In [50]:
data['sqft']=data['sqft'].fillna(round(data['sqft'].mean(), 1)) #заполнение отсутствующих значений 

In [51]:
data['sqft'].value_counts() #обзор значений в столбце

8927.6    312718
0.0        11853
960.0        284
800.0        257
900.0        234
           ...  
5031.0         1
4325.0         1
4974.0         1
469.0          1
4371.0         1
Name: sqft, Length: 5428, dtype: int64

### zipcode

In [52]:
data.zipcode.value_counts() #обзор значений в столбце

32137         2139
33131         1551
34747         1486
78245         1383
34759         1333
              ... 
92683            1
27610-2861       1
27613-4042       1
27612-5442       1
44704            1
Name: zipcode, Length: 4541, dtype: int64

In [53]:
data['zipcode'] = pd.to_numeric(data['zipcode'], errors='coerce') #приведение к числовому типу

In [54]:
data['zipcode']=data['zipcode'].fillna(0) #заполнение отсутствующих значений

### beds

In [55]:
data['beds'].value_counts() #обзор значений в столбце

3 Beds        52947
4 Beds        35149
3             31191
2 Beds        26084
4             19915
              ...  
8.93 acres        1
5,510 sqft        1
3.8 acres         1
7,104 sqft        1
8,479 sqft        1
Name: beds, Length: 1146, dtype: int64

In [56]:
data['beds'].unique() #обзор уникальных значений

array(['4', '3 Beds', '5 Beds', ..., '2.72 acres', '9,565 sqft',
       '8,479 sqft'], dtype=object)

In [57]:
def transform_beds(
    value: str
):  
    if isinstance(value, str):
        if  'acres' in value:
            return 0
        if  'sqft' in value:
            return 0
    return value
data['beds']=data['beds'].apply(transform_beds) #преобразование значений в столбце

In [58]:
data['beds'] = pd.to_numeric(data['beds'], errors='coerce') #приведение к числовому типу

In [59]:
data['beds']=data['beds'].fillna(round(data['beds'].mean(), 0)) #заполнение отсутствующих значений

In [60]:
data['beds'].value_counts() #обзор значений в столбце

3.0      310186
4.0       25146
2.0       18427
5.0        7764
1.0        5012
0.0        4359
6.0        2103
7.0         577
8.0         357
9.0         102
10.0         77
11.0         40
12.0         36
16.0         21
14.0         15
13.0         14
15.0         11
24.0          9
20.0          7
18.0          7
17.0          4
28.0          3
30.0          3
22.0          3
26.0          2
29.0          2
34.0          2
27.0          2
32.0          2
40.0          2
99.0          1
33.0          1
21.0          1
75.0          1
19.0          1
144.0         1
48.0          1
23.0          1
44.0          1
78.0          1
35.0          1
Name: beds, dtype: int64

### state

In [61]:
data.state.value_counts() #обзор значений в столбце

FL    114570
TX     83269
NY     24329
CA     23170
NC     21768
TN     18218
WA     13730
OH     12423
IL      8823
NV      8402
GA      6632
CO      6371
PA      5493
MI      5119
DC      4581
AZ      3347
IN      3280
OR      2774
MA      1494
UT      1319
MD      1086
VT       864
MO       832
VA       800
WI       452
NJ       436
ME       258
IA       242
KY        90
OK        49
MS        40
SC        28
MT         7
DE         5
Fl         1
BA         1
AL         1
OT         1
OS         1
Name: state, dtype: int64

### stories

In [62]:
data['stories'].value_counts() #обзор значений в столбце

1.0                                  66808
2.0                                  55003
1                                    22795
2                                    17976
3.0                                  11233
                                     ...  
Manufactured Home, Non-Site Built        1
Bedroom - Split Plan                     1
78                                       1
None                                     1
65.0                                     1
Name: stories, Length: 348, dtype: int64

In [63]:
data['stories'] = pd.to_numeric(data['stories'], errors='coerce') #приведение к числовому типу

In [64]:
data['stories'].value_counts() #обзор значений в столбце

1.0       90942
2.0       75986
3.0       15971
0.0       11440
9.0        3381
          ...  
1120.0        1
2.2           1
1002.0        1
96.0          1
65.0          1
Name: stories, Length: 85, dtype: int64

In [65]:
data['stories']=data['stories'].fillna(round(data['stories'].mean(), 0)) #заполнение отсутствующих значений

### mls-id

In [66]:
data['mls-id'].value_counts() #обзор значений в столбце

No           3
No MLS#      3
983469       2
241766       2
A10761504    2
            ..
1020314      1
A10762436    1
1592770      1
14201834     1
F10202858    1
Name: mls-id, Length: 24902, dtype: int64

In [67]:
data.drop(['mls-id',], axis=1, inplace=True) #удаление столбца, так как значения пересекаются со значениями столбца MlsId

### PrivatePool

In [68]:
data.PrivatePool.value_counts() #обзор значений в столбце

yes    28592
Yes    11434
Name: PrivatePool, dtype: int64

In [69]:
data['PrivatePool']=data['PrivatePool'].fillna(value='not') #заполнение отсутствующих зачений

In [70]:
#data['PrivatePool'] = data['PrivatePool'].astype(str)
data['PrivatePool'] = data['PrivatePool'].apply(lambda x: x.lower()) #преобразование данных

In [71]:
data['PrivatePool'].value_counts() #обзор значений в столбце

not    334280
yes     40026
Name: PrivatePool, dtype: int64

### MlsId

In [72]:
data['MlsId'].value_counts() #обзор значений в столбце

NO MLS                     24
No MLS #                   16
 A, Houston, TX 77008      13
 12A, Orlando, FL 32833    11
 B, Houston, TX 77008       9
                           ..
241208                      1
687215                      1
3866902                     1
19491906                    1
10374233                    1
Name: MlsId, Length: 232622, dtype: int64

In [73]:
data['MlsId'] = pd.to_numeric(data['MlsId'], errors='coerce') #преобразование к числовому типу 

In [74]:
data['MlsId'].value_counts() #обзор значений в столбце

2101941.0     6
1412350.0     6
2088662.0     6
14187092.0    5
14061735.0    5
             ..
10589197.0    1
3481963.0     1
1418405.0     1
14135173.0    1
10374233.0    1
Name: MlsId, Length: 161084, dtype: int64

In [75]:
data=data.drop(['MlsId',], axis=1) #удаление столбца, так как информацией из него можно пренебречь

# Data Preprocessing

In [76]:
data=data.drop(['homeFacts', 'schools',], axis=1) #удаляем столбцы, так как не удалось сделать их корректную обработку и вытащить значения
data=pd.DataFrame(data=data)

In [77]:
data.isna().sum() #проверяем,что у нас нет NaN  в датасете

status          0
propertyType    0
street          0
baths           0
fireplace       0
city            0
sqft            0
zipcode         0
beds            0
state           0
stories         0
PrivatePool     0
new_target      0
dtype: int64

In [78]:
data.info() #общий обзор получившегося датафрейма

<class 'pandas.core.frame.DataFrame'>
Int64Index: 374306 entries, 0 to 377184
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   status        374306 non-null  object 
 1   propertyType  374306 non-null  object 
 2   street        374306 non-null  object 
 3   baths         374306 non-null  float64
 4   fireplace     374306 non-null  object 
 5   city          374306 non-null  object 
 6   sqft          374306 non-null  float64
 7   zipcode       374306 non-null  float64
 8   beds          374306 non-null  float64
 9   state         374306 non-null  object 
 10  stories       374306 non-null  float64
 11  PrivatePool   374306 non-null  object 
 12  new_target    374306 non-null  float64
dtypes: float64(6), object(7)
memory usage: 40.0+ MB


## Standartisation

In [79]:

scaled_features = data.copy()

col_names = ['baths', 'sqft', 'zipcode', 'beds', 'stories']
features = scaled_features[col_names]

scaler = MinMaxScaler().fit(features.values)
features = scaler.transform(features.values)

scaled_features[col_names] = features

In [80]:
data=scaled_features

## FeatureEngineering

In [81]:
categorical_features=[ 'status', 'fireplace',  'PrivatePool'] #применение get_dummies для преобразования категориальных переменных, в которых не сильно высокая вариативность представленных уникальных значений и поэтому создаваемые новые признаки не будут ззамедлять работу нашего сервиса
data = pd.get_dummies(data, columns = categorical_features , prefix_sep = "_", drop_first = True)
data.head()

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
0,single,240 Heather Ln,0.004667,Southern Pines,6e-06,0.085646,0.027778,NC,0.001786,418000.0,...,0,0,0,0,0,0,0,0,1,0
1,single-family,12911 E Heroy Ave,0.0056,Spokane Valley,2e-05,0.299343,0.020833,WA,0.001786,310000.0,...,0,0,0,0,0,0,0,0,0,0
2,single-family,2005 Westridge Rd,0.0056,Los Angeles,2e-05,0.271685,0.020833,CA,0.000893,2895000.0,...,0,0,0,0,0,0,0,0,1,1
3,single-family,4311 Livingston Ave,0.0056,Dallas,2e-05,0.2269,0.020833,TX,0.002679,2395000.0,...,0,0,0,0,0,0,0,0,1,0
4,lot/land,1524 Kiscoe St,0.0056,Palm Bay,2e-05,0.099286,0.020833,FL,0.001786,5000.0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
le = LabelEncoder() #применение label_encoder для преобразования категориальных переменных,вариативность уникальных значений которых может достигать нескольких тысяч
data['propertyType'] = le.fit_transform(data['propertyType'])
data['street'] = le.fit_transform(data['street'])
data['city'] = le.fit_transform(data['city'])
data['state'] = le.fit_transform(data['state'])
data.head()

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
0,149,124833,0.004667,1705,6e-06,0.085646,0.027778,21,0.001786,418000.0,...,0,0,0,0,0,0,0,0,1,0
1,150,43403,0.0056,1715,2e-05,0.299343,0.020833,37,0.001786,310000.0,...,0,0,0,0,0,0,0,0,0,0
2,150,102248,0.0056,1031,2e-05,0.271685,0.020833,3,0.000893,2895000.0,...,0,0,0,0,0,0,0,0,1,1
3,150,197986,0.0056,430,2e-05,0.2269,0.020833,33,0.002679,2395000.0,...,0,0,0,0,0,0,0,0,1,0
4,105,68022,0.0056,1341,2e-05,0.099286,0.020833,7,0.001786,5000.0,...,0,0,0,0,0,0,0,0,0,0


## Outliers_preprocessing

### beds

In [83]:
lower_bound = data.beds.quantile(q=0.01)
upper_bound = data.beds.quantile(q=0.99)
data[(data.beds < lower_bound) | (data.beds > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
214,149,298217,0.006000,915,0.000007,0.104835,0.041667,7,0.001786,504000.0,...,0,0,0,0,0,0,0,0,0,0
260,125,189086,0.010667,1284,0.000020,0.285473,0.097222,3,0.001786,2100000.0,...,0,0,0,0,0,0,0,0,0,0
334,149,222062,0.009333,152,0.000020,0.062804,0.048611,15,0.002679,1995000.0,...,0,0,0,0,0,0,0,0,1,0
411,149,247664,0.008000,1388,0.000020,0.100034,0.041667,7,0.001786,1990000.0,...,0,0,0,0,0,0,0,0,0,1
951,125,76613,0.005333,1118,0.000020,0.100016,0.041667,7,0.001786,599000.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376729,149,176629,0.006667,1422,0.000020,0.293378,0.041667,27,0.003571,1599000.0,...,0,0,0,0,0,0,0,0,0,0
376816,149,189919,0.009333,1589,0.000020,0.277891,0.041667,3,0.001786,18995000.0,...,0,0,0,0,0,0,0,0,1,0
376865,125,90599,0.009333,40,0.000020,0.279988,0.076389,3,0.001786,1750000.0,...,0,0,0,0,0,0,0,0,0,0
376884,149,152053,0.004667,967,0.000005,0.235471,0.041667,33,0.001786,182000.0,...,0,0,0,0,0,0,0,0,1,0


In [84]:
data.replace(to_replace=data[(data.beds < lower_bound) | (data.beds > upper_bound)].beds, value=data.beds.mean(), inplace=True)

In [85]:
data[(data.beds < lower_bound) | (data.beds > upper_bound)].beds

214       0.041667
260       0.097222
334       0.048611
411       0.041667
951       0.041667
            ...   
376729    0.041667
376816    0.041667
376865    0.076389
376884    0.041667
377016    0.041667
Name: beds, Length: 3412, dtype: float64

### stories

In [86]:
lower_bound = data.stories.quantile(q=0.025)
upper_bound = data.stories.quantile(q=0.975)
data[(data.stories < lower_bound) | (data.stories > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
67,40,101124,0.005600,320,0.000020,0.182868,0.020833,11,0.008036,749000.0,...,0,0,0,0,0,0,0,0,0,1
123,40,161824,0.005600,969,0.000020,0.268849,0.020833,23,0.008036,519900.0,...,0,0,0,0,0,0,0,0,0,1
170,40,187759,0.005600,609,0.000020,0.100493,0.020833,7,0.008036,279900.0,...,0,0,0,0,0,0,0,0,0,0
202,49,170416,0.005600,1226,0.000020,0.030264,0.020833,24,0.012500,455000.0,...,0,0,0,0,0,0,0,0,0,0
402,40,298042,0.005600,1369,0.000020,0.099642,0.020833,7,0.004464,150000.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376880,49,229424,0.005600,210,0.000020,0.031583,0.020833,24,0.005357,229000.0,...,0,0,0,0,0,0,0,0,0,0
377023,40,64340,0.005600,215,0.000020,0.033870,0.020833,24,0.007143,750000.0,...,0,0,0,0,0,0,0,0,0,0
377090,40,207286,0.005600,796,0.000020,0.099627,0.020833,7,0.004464,125000.0,...,0,0,0,0,0,0,0,0,0,0
377102,40,166564,0.001333,609,0.000002,0.100493,0.006944,7,0.008036,149999.0,...,0,0,0,0,0,0,0,0,0,1


In [87]:
data[(data.stories < lower_bound) | (data.stories > upper_bound)].stories.apply(lambda x:data.stories.mean())

67        0.001756
123       0.001756
170       0.001756
202       0.001756
402       0.001756
            ...   
376880    0.001756
377023    0.001756
377090    0.001756
377102    0.001756
377181    0.001756
Name: stories, Length: 7834, dtype: float64

In [88]:
data[(data.stories < lower_bound) | (data.stories > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
67,40,101124,0.005600,320,0.000020,0.182868,0.020833,11,0.008036,749000.0,...,0,0,0,0,0,0,0,0,0,1
123,40,161824,0.005600,969,0.000020,0.268849,0.020833,23,0.008036,519900.0,...,0,0,0,0,0,0,0,0,0,1
170,40,187759,0.005600,609,0.000020,0.100493,0.020833,7,0.008036,279900.0,...,0,0,0,0,0,0,0,0,0,0
202,49,170416,0.005600,1226,0.000020,0.030264,0.020833,24,0.012500,455000.0,...,0,0,0,0,0,0,0,0,0,0
402,40,298042,0.005600,1369,0.000020,0.099642,0.020833,7,0.004464,150000.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376880,49,229424,0.005600,210,0.000020,0.031583,0.020833,24,0.005357,229000.0,...,0,0,0,0,0,0,0,0,0,0
377023,40,64340,0.005600,215,0.000020,0.033870,0.020833,24,0.007143,750000.0,...,0,0,0,0,0,0,0,0,0,0
377090,40,207286,0.005600,796,0.000020,0.099627,0.020833,7,0.004464,125000.0,...,0,0,0,0,0,0,0,0,0,0
377102,40,166564,0.001333,609,0.000002,0.100493,0.006944,7,0.008036,149999.0,...,0,0,0,0,0,0,0,0,0,1


### baths

In [89]:
lower_bound = data.baths.quantile(q=0.025)
upper_bound = data.baths.quantile(q=0.975)
data[(data.baths < lower_bound) | (data.baths > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
75,149,190627,0.006000,915,0.000011,0.104832,0.034722,7,0.001786,414900.0,...,0,0,0,0,0,0,0,0,0,0
113,149,164991,0.006667,809,0.000020,0.232611,0.027778,33,0.001786,2197000.0,...,0,0,0,0,0,0,0,0,0,1
214,149,298217,0.006000,915,0.000007,0.104835,0.041667,7,0.001786,504000.0,...,0,0,0,0,0,0,0,0,0,0
260,125,189086,0.010667,1284,0.000020,0.285473,0.097222,3,0.001786,2100000.0,...,0,0,0,0,0,0,0,0,0,0
295,100,318872,0.000000,809,0.000000,0.232379,0.020833,33,0.001786,39900.0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377066,149,204538,0.006667,430,0.000020,0.226972,0.034722,33,0.001786,1307000.0,...,0,0,0,0,0,0,0,0,1,1
377092,149,102465,0.006667,809,0.000020,0.232327,0.020833,33,0.003571,549000.0,...,0,0,0,0,0,0,0,0,0,0
377098,100,13755,0.000000,1415,0.000000,0.102436,0.020833,7,0.001786,7000.0,...,0,1,0,0,0,0,0,0,0,0
377152,149,80892,0.006667,1479,0.000020,0.270062,0.027778,23,0.001786,1000000.0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
data[(data.baths < lower_bound) | (data.baths > upper_bound)].baths.apply(lambda x:data.baths.mean())

75        0.005618
113       0.005618
214       0.005618
260       0.005618
295       0.005618
            ...   
377066    0.005618
377092    0.005618
377098    0.005618
377152    0.005618
377180    0.005618
Name: baths, Length: 11154, dtype: float64

In [91]:
data[(data.baths < lower_bound) | (data.baths > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
75,149,190627,0.006000,915,0.000011,0.104832,0.034722,7,0.001786,414900.0,...,0,0,0,0,0,0,0,0,0,0
113,149,164991,0.006667,809,0.000020,0.232611,0.027778,33,0.001786,2197000.0,...,0,0,0,0,0,0,0,0,0,1
214,149,298217,0.006000,915,0.000007,0.104835,0.041667,7,0.001786,504000.0,...,0,0,0,0,0,0,0,0,0,0
260,125,189086,0.010667,1284,0.000020,0.285473,0.097222,3,0.001786,2100000.0,...,0,0,0,0,0,0,0,0,0,0
295,100,318872,0.000000,809,0.000000,0.232379,0.020833,33,0.001786,39900.0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377066,149,204538,0.006667,430,0.000020,0.226972,0.034722,33,0.001786,1307000.0,...,0,0,0,0,0,0,0,0,1,1
377092,149,102465,0.006667,809,0.000020,0.232327,0.020833,33,0.003571,549000.0,...,0,0,0,0,0,0,0,0,0,0
377098,100,13755,0.000000,1415,0.000000,0.102436,0.020833,7,0.001786,7000.0,...,0,1,0,0,0,0,0,0,0,0
377152,149,80892,0.006667,1479,0.000020,0.270062,0.027778,23,0.001786,1000000.0,...,0,0,0,0,0,0,0,0,0,0


### sqft

In [92]:
lower_bound = data.sqft.quantile(q=0.025)
upper_bound = data.sqft.quantile(q=0.975)
data[(data.sqft < lower_bound) | (data.sqft > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
3354,126,79900,0.024000,1031,0.000026,0.271637,0.222222,3,0.001786,3600000.0,...,0,0,0,0,0,0,0,0,0,0
3511,149,233558,0.005600,926,0.000020,0.274588,0.048611,3,0.001786,6950000.0,...,0,0,0,0,0,0,0,0,1,0
5294,149,154960,0.005600,1372,0.000020,0.133126,0.034722,25,0.001786,2195000.0,...,0,0,0,0,0,0,0,0,1,0
7324,126,114990,0.002667,967,0.000137,0.235468,1.000000,33,0.001786,1590000.0,...,0,0,0,0,0,0,0,0,0,0
7338,149,156004,0.005600,1915,0.000025,0.060390,0.048611,5,0.001786,8150000.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373599,149,274440,0.005600,1683,0.000032,0.133171,0.034722,25,0.001786,995000.0,...,0,1,0,0,0,0,0,0,1,0
375044,126,222650,0.005600,1803,0.000053,0.296908,0.020833,37,0.001786,1100000.0,...,0,0,0,0,0,0,0,0,0,0
375276,149,215988,0.008667,1204,0.000023,0.112251,0.034722,32,0.002232,4950000.0,...,0,0,0,0,0,0,0,0,1,0
376348,149,100216,0.005600,200,0.000028,0.111714,0.034722,32,0.001786,3950000.0,...,0,0,0,0,0,0,0,0,1,0


In [93]:
data[(data.sqft < lower_bound) | (data.sqft > upper_bound)].drop(axis=0, columns='sqft', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [94]:
data[(data.sqft < lower_bound) | (data.sqft > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
3354,126,79900,0.024000,1031,0.000026,0.271637,0.222222,3,0.001786,3600000.0,...,0,0,0,0,0,0,0,0,0,0
3511,149,233558,0.005600,926,0.000020,0.274588,0.048611,3,0.001786,6950000.0,...,0,0,0,0,0,0,0,0,1,0
5294,149,154960,0.005600,1372,0.000020,0.133126,0.034722,25,0.001786,2195000.0,...,0,0,0,0,0,0,0,0,1,0
7324,126,114990,0.002667,967,0.000137,0.235468,1.000000,33,0.001786,1590000.0,...,0,0,0,0,0,0,0,0,0,0
7338,149,156004,0.005600,1915,0.000025,0.060390,0.048611,5,0.001786,8150000.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373599,149,274440,0.005600,1683,0.000032,0.133171,0.034722,25,0.001786,995000.0,...,0,1,0,0,0,0,0,0,1,0
375044,126,222650,0.005600,1803,0.000053,0.296908,0.020833,37,0.001786,1100000.0,...,0,0,0,0,0,0,0,0,0,0
375276,149,215988,0.008667,1204,0.000023,0.112251,0.034722,32,0.002232,4950000.0,...,0,0,0,0,0,0,0,0,1,0
376348,149,100216,0.005600,200,0.000028,0.111714,0.034722,32,0.001786,3950000.0,...,0,0,0,0,0,0,0,0,1,0


### zipcode

In [95]:
lower_bound = data.zipcode.quantile(q=0.025)
upper_bound = data.zipcode.quantile(q=0.975)
data[(data.zipcode < lower_bound) | (data.zipcode > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
1,150,43403,0.005600,1715,0.000020,0.299343,0.020833,37,0.001786,310000.0,...,0,0,0,0,0,0,0,0,0,0
17,167,124771,0.005600,1226,0.000020,0.030252,0.020833,24,0.001786,2650000.0,...,0,0,0,0,0,0,0,0,0,0
42,130,324918,0.005600,1226,0.000020,0.030189,0.020833,24,0.001786,850000.0,...,0,0,0,0,0,0,0,0,0,0
69,149,134145,0.002667,958,0.000005,0.026252,0.013889,22,0.000893,269000.0,...,0,0,0,0,0,0,0,0,0,0
78,40,168245,0.005600,182,0.000020,0.006417,0.020833,14,0.001786,916900.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377044,40,260741,0.002667,182,0.000020,0.006390,0.020833,14,0.000893,1450000.0,...,0,0,0,0,0,0,0,0,0,0
377062,42,185573,0.004667,958,0.000007,0.026252,0.034722,22,0.002679,569900.0,...,0,0,0,0,0,0,0,0,0,0
377088,150,219118,0.005600,2004,0.000020,0.298392,0.020833,37,0.001786,299900.0,...,0,0,0,0,0,0,0,0,1,0
377108,40,255894,0.005600,182,0.000020,0.006429,0.020833,14,0.001786,399000.0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
data[(data.zipcode > upper_bound)].drop(axis=0, columns='zipcode', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [97]:
data[(data.zipcode < lower_bound) | (data.zipcode > upper_bound)]

Unnamed: 0,propertyType,street,baths,city,sqft,zipcode,beds,state,stories,new_target,...,status_option,status_pending,"status_pending,",status_pf,status_price change,status_reactivated,status_temporary,status_under,fireplace_yes,PrivatePool_yes
1,150,43403,0.005600,1715,0.000020,0.299343,0.020833,37,0.001786,310000.0,...,0,0,0,0,0,0,0,0,0,0
17,167,124771,0.005600,1226,0.000020,0.030252,0.020833,24,0.001786,2650000.0,...,0,0,0,0,0,0,0,0,0,0
42,130,324918,0.005600,1226,0.000020,0.030189,0.020833,24,0.001786,850000.0,...,0,0,0,0,0,0,0,0,0,0
69,149,134145,0.002667,958,0.000005,0.026252,0.013889,22,0.000893,269000.0,...,0,0,0,0,0,0,0,0,0,0
78,40,168245,0.005600,182,0.000020,0.006417,0.020833,14,0.001786,916900.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377044,40,260741,0.002667,182,0.000020,0.006390,0.020833,14,0.000893,1450000.0,...,0,0,0,0,0,0,0,0,0,0
377062,42,185573,0.004667,958,0.000007,0.026252,0.034722,22,0.002679,569900.0,...,0,0,0,0,0,0,0,0,0,0
377088,150,219118,0.005600,2004,0.000020,0.298392,0.020833,37,0.001786,299900.0,...,0,0,0,0,0,0,0,0,1,0
377108,40,255894,0.005600,182,0.000020,0.006429,0.020833,14,0.001786,399000.0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 374306 entries, 0 to 377184
Data columns (total 38 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   propertyType           374306 non-null  int64  
 1   street                 374306 non-null  int64  
 2   baths                  374306 non-null  float64
 3   city                   374306 non-null  int64  
 4   sqft                   374306 non-null  float64
 5   zipcode                374306 non-null  float64
 6   beds                   374306 non-null  float64
 7   state                  374306 non-null  int64  
 8   stories                374306 non-null  float64
 9   new_target             374306 non-null  float64
 10  status_active          374306 non-null  uint8  
 11  status_apartment       374306 non-null  uint8  
 12  status_auction         374306 non-null  uint8  
 13  status_back on market  374306 non-null  uint8  
 14  status_closed          374306 non-nu

# Data Split

In [99]:
X = data.drop(['new_target'], axis=1) 
y = data['new_target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=VAL_SIZE, shuffle=True, random_state=RANDOM_SEED)

## Feature Importance

# ML

## Gradient Boosting for regression.

In [100]:
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]

#random_grid = {'n_estimators': n_estimators}
#gbr = GradientBoostingRegressor(random_state=42)
#gbr_random = RandomizedSearchCV(estimator=gbr, param_distributions=random_grid, n_iter=30, 
#                               cv=3, verbose=2, random_state=42, return_train_score=True)
#gbr_random.fit(X_train, np.log(y_train))
#print(gbr_random.best_params_)

In [101]:
gbr = GradientBoostingRegressor(random_state=42,
                              n_estimators=1000)
gbr.fit(X_train, np.log(y_train))

GradientBoostingRegressor(n_estimators=1000, random_state=42)

In [102]:
predict_test_gbr = np.exp(gbr.predict(X_test))

In [103]:
print((mape(y_test, predict_test_gbr)))

11.065065152306104


## A random forest regressor.

In [104]:
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 500, num = 3)]
#min_samples_split=[1, 2, 3, 5]
#min_samples_leaf=[1, 2, 3, 5]
#max_features=[0.25, 0.5, 0.75, 1.0]
#bootstrap=[True, False]
#max_depth = [int(x) for x in np.linspace(70, 200, num = 11)]
#max_depth.append(None)


#random_grid = {'n_estimators': n_estimators,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'max_features': max_features,
#               'bootstrap': bootstrap,
#               'max_depth': max_depth}
#rf = RandomForestRegressor(random_state=42)
#rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=2, 
#                               cv=3, verbose=2, random_state=42, return_train_score=True)
#rf_random.fit(X_train, np.log(y_train))
#print(rf_random.best_params_)

In [105]:
rf = RandomForestRegressor(random_state=42,
                           n_estimators=600 ,
                            max_depth=161,
                          min_samples_split=2 ,
                           min_samples_leaf=3 ,
                            max_features=0.75,
                            bootstrap=False )
rf.fit(X_train, np.log(y_train))

RandomForestRegressor(bootstrap=False, max_depth=161, max_features=0.75,
                      min_samples_leaf=3, n_estimators=600, random_state=42)

In [106]:
predict_test_rt = np.exp(rf.predict(X_test))

In [107]:
print((mape(y_test, predict_test_rt)))

27.98727423162839


## GaussianProcessRegressor

In [108]:
gpr= GaussianProcessRegressor(kernel=None, 
                               alpha=1e-10,
                               optimizer='fmin_l_bfgs_b',
                               n_restarts_optimizer=0, 
                               normalize_y=False,
                               random_state=42) 

In [109]:
predict_test_gpr = np.exp(gpr.predict(X_test))

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [110]:
print((mape(y_test, predict_test_gpr)))

0.9999489868044902


## SGDRegressor

In [111]:
#loss=['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
#penalty=['l2', 'l1', 'elasticnet']
#learning_rate=['constant','optimal','invscaling','adaptive']

#random_grid = {'loss':loss,
#                'penalty':penalty,
#              'learning_rate':learning_rate}
#sgd = SGDRegressor(random_state=42)
#sgd_random = RandomizedSearchCV(estimator=sgd, param_distributions=random_grid, n_iter=3, 
#                               cv=3, verbose=2, random_state=42, return_train_score=True)
#sgd_random.fit(X_train, np.log(y_train))
#print(sgd_random.best_params_)

In [112]:
sgd = SGDRegressor(loss='huber',
                    penalty='l1',
                    learning_rate='adaptive', 
                    early_stopping=True,
                    random_state=42,
                    validation_fraction=0.2,
                    n_iter_no_change=5 )
sgd.fit(X_train, np.log(y_train))

SGDRegressor(early_stopping=True, learning_rate='adaptive', loss='huber',
             penalty='l1', random_state=42, validation_fraction=0.2)

In [113]:
predict_test_sgd = np.exp(sgd.predict(X_test))

In [114]:
print((mape(y_test, predict_test_sgd)))

1.0211067687624333


## KNeighborsRegressor

In [115]:
#weights=['uniform', 'distance']
#algorithm=['auto', 'ball_tree', 'kd_tree', 'brute']
#p=[1, 2]

#random_grid = {'weights':weights,
#                'algorithm':algorithm,
#                'p':p}
#knr = KNeighborsRegressor()
#knr_random = RandomizedSearchCV(estimator=knr, param_distributions=random_grid, n_iter=1, 
#                               cv=3, verbose=2,  return_train_score=True)
#knr_random.fit(X_train, np.log(y_train))
#print(knr_random.best_params_)

In [116]:
knr = KNeighborsRegressor(weights='uniform',
                algorithm='brute',
                p=2)
knr.fit(X_train, np.log(y_train))

KNeighborsRegressor(algorithm='brute')

In [117]:
predict_test_knr = np.exp(knr.predict(X_test))

In [118]:
print((mape(y_test, predict_test_knr)))

7.746630310429071


## Stack of estimators with a final regressor.

In [119]:
#estimators = [('gpr', GaussianProcessRegressor(random_state=42)),
#               ('sgd', SGDRegressor(random_state=42))]
#sr = StackingRegressor(estimators=estimators)
#sr.fit(X_train, np.log(y_train)) Не хватает объема ОЗУ

In [120]:
#estimators = [('gbr', GradientBoostingRegressor(random_state=42)),
#               ('rfr', RandomForestRegressor(random_state=42,
#                                             n_estimators=600 ,
#                                            max_depth=161,
#                                            min_samples_split=2 ,
#                                            min_samples_leaf=3 ,
#                                            max_features=0.75,
#                                            bootstrap=False))]
#sr = StackingRegressor(estimators=estimators)
#sr.fit(X_train, np.log(y_train)) Не хватает объема ОЗУ

In [121]:
#predict_test = np.exp(sr.predict(X_test))

In [122]:
#print((mape(y_test, predict_test)))