In [41]:

import warnings

import pandas as pd
%matplotlib inline

warnings.filterwarnings("ignore")

In [42]:
try:
    df = pd.read_csv("data/data_deals.csv")
    df.drop(columns=['_id'])
    print(df.columns)
    print(df.shape)
except Exception:
    print("Seems we have a problem")


Index(['_id', 'sale_day', 'address', 'property_type', 'rooms_number', 'floor',
       'build_year', 'building_mr', 'sold_part', 'full_price', 'city'],
      dtype='object')
(517569, 11)


In [43]:
# sale_day "05-06-2016"
# build_year "2017"
df['sale_day'] = pd.to_datetime(df['sale_day'], format="%d-%m-%Y")
df['build_year'] = pd.to_datetime(df['build_year'], format="%Y")
print(df.head(1))


                        _id   sale_day        address    property_type  \
0  5cab7d497e16a98c7a79bd4a 2018-12-25  יונה הנביא 36  דירה בבית קומות   

   rooms_number floor build_year  building_mr sold_part   full_price  \
0           5.0  קרקע 2017-01-01          126       NaN  5,470,085 ₪   

          city  
0  תל אביב יפו  


In [44]:
# remove properties that their date is before 2010
from_year = 2010
df = df.loc[(df['sale_day'].dt.year >= from_year)]
df.shape

(321628, 11)

In [45]:
# List unique values in the df['name'] column
cols = ['full_price', 'sold_part', 'floor']
for col in cols:
    try:
        df.replace({col: {'₪': '',
                          '\$': '',
                          'קרקע': 0,
                          'לא צוין מחיר': None,
                          'לא צוין': None,
                          ',': '',
                          '%': '',
                          ' ': ''}}, regex=True, inplace=True)
    except TypeError:
        # in case of double replacement
        pass
df.shape

(321628, 11)

In [46]:
# remove sale date 'day' value
df['sale_day'] = df['sale_day']
df['sale_day_year'] = df['sale_day'].apply(lambda x: x.strftime('%Y'))
df['sale_day_month'] = df['sale_day'].apply(lambda x: x.strftime('%m'))

In [47]:
# Getting Description
df = df[pd.notnull(df['floor'])]
df['floor'] = df['floor'].astype(int)
df['rooms_number'] = df['rooms_number'].astype(int)
df['full_price'] = df['full_price'].astype(int)
df['full_price'].describe()

count    3.192810e+05
mean     1.265348e+06
std      1.062363e+06
min      2.000000e+00
25%      6.600000e+05
50%      1.100000e+06
75%      1.600000e+06
max      1.322710e+08
Name: full_price, dtype: float64

In [48]:

split_data = df['address'].str.rsplit(n=1, expand=True)
df['street_name'] = split_data[0]
df['street_number'] = split_data[1]


In [49]:
# format sold_part to full_price
df['sold_part'] = df['sold_part'].astype(float)
df['sold_part'].fillna(100, inplace=True)
df['final_price'] = (100 / df['sold_part']) * df['full_price']
df['final_price'] = df['final_price'].astype(int)
df.drop(columns=['sold_part', 'full_price'], inplace=True)
print(df.head(5))


                        _id   sale_day        address    property_type  \
0  5cab7d497e16a98c7a79bd4a 2018-12-25  יונה הנביא 36  דירה בבית קומות   
1  5cab7d497e16a98c7a79bd4b 2018-12-07  יונה הנביא 25  דירה בבית קומות   
2  5cab7d497e16a98c7a79bd4c 2018-05-10  יונה הנביא 26  דירה בבית קומות   
3  5cab7d497e16a98c7a79bd4d 2018-01-24  יונה הנביא 24  דירה בבית קומות   
4  5cab7d497e16a98c7a79bd4e 2017-09-13  יונה הנביא 26  דירה בבית קומות   

   rooms_number  floor build_year  building_mr         city sale_day_year  \
0             5      0 2017-01-01          126  תל אביב יפו          2018   
1             3      1 2018-01-01           78  תל אביב יפו          2018   
2             3      0 1950-01-01           58  תל אביב יפו          2018   
3             1      1 1960-01-01           23  תל אביב יפו          2018   
4             2      1 1930-01-01           58  תל אביב יפו          2017   

  sale_day_month street_name street_number  final_price  
0             12  יונה הנביא      

In [50]:
# remove properties that their price is too low or too high
min_price = 100000
max_price = 25000000

In [51]:
# remove properties that their price is too low or too high
df = df.loc[(min_price < df['final_price']) & (df['final_price'] < max_price)]
df.shape

(319055, 14)

In [52]:
df.to_csv('data/data_deals_done.csv', encoding='utf-8', index=False)