In [1]:
import pandas as pd


In [2]:
# Load the raw deals export and report its columns and size.
try:
    df = pd.read_csv("data/data_deals.csv")
    # NOTE: an earlier `df.drop(columns=['_id'])` call discarded its result,
    # so `_id` was never actually removed. The dead call is gone and the
    # column is kept, which leaves every downstream result unchanged.
    print(df.columns)
    print(df.shape)
except Exception:
    # Keep the friendly message but re-raise: swallowing the error would leave
    # `df` undefined and resurface later as an unrelated NameError.
    print("Seems we have a problem")
    raise


Index(['_id', 'sale_day', 'address', 'property_type', 'rooms_number', 'floor',
       'build_year', 'building_mr', 'sold_part', 'full_price', 'city'],
      dtype='object')
(517569, 11)


In [3]:
# Convert the two date-like columns to datetimes.
#   sale_day   is parsed as day-month-year, e.g. "05-06-2016"
#   build_year is parsed as a bare year, e.g. "2017" (becomes 1 January of that year)
df = df.assign(
    sale_day=pd.to_datetime(df['sale_day'], format="%d-%m-%Y"),
    build_year=pd.to_datetime(df['build_year'], format="%Y"),
)
print(df.head(1))


                        _id   sale_day        address    property_type  \
0  5cab7d497e16a98c7a79bd4a 2018-12-25  יונה הנביא 36  דירה בבית קומות   

   rooms_number floor build_year  building_mr sold_part   full_price  \
0           5.0  קרקע 2017-01-01          126       NaN  5,470,085 ₪   

          city  
0  תל אביב יפו  


In [4]:
# Keep only deals whose sale date falls in `from_year` or later.
from_year = 2010
df = df[df['sale_day'].dt.year.ge(from_year)]
df.shape


(321628, 11)

In [5]:
# Keep only properties built in `from_year` or later, then store the build
# year as a plain integer instead of a timestamp.
from_year = 1945
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the previous frame (SettingWithCopyWarning).
df = df.loc[df['build_year'].dt.year >= from_year].copy()
# Rows with a missing build year fail the comparison above and are already
# gone, so the integer cast cannot hit NaN. `.dt.year` is the vectorized
# equivalent of formatting every timestamp with '%Y' and parsing it back.
df['build_year'] = df['build_year'].dt.year.astype(int)
df.shape

(311848, 11)

In [6]:

# Split each address at its last run of whitespace: the left part becomes the
# street, the trailing token becomes the street number.
# NOTE(review): an address with no trailing house number would have the last
# word of its street name land in street_number — confirm the data never does this.
split_data = df['address'].str.rsplit(n=1, expand=True)
df = df.assign(street=split_data[0], street_number=split_data[1])


In [7]:
# Load the street/city -> neighbourhood lookup table and report its layout.
try:
    nmap = pd.read_csv("data/map_address_neighborhood.csv")
    print(nmap.columns)
    print(nmap.shape)
except Exception:
    # Re-raise after the message: without `nmap` the merge below cannot run,
    # and hiding the real error would only produce a confusing NameError.
    print("Seems we have a problem")
    raise


# Inner join on (city, street): deals with no matching lookup row are dropped.
df = pd.merge(df, nmap,  how='inner', on=['city', 'street'])

df.shape


Index(['street', 'city', 'neighborhood', 'address_area',
       'address_neighborhood'],
      dtype='object')
(10971, 5)


(306832, 16)

In [8]:
# Strip currency/percent symbols, thousands separators and spaces from the
# text-encoded numeric columns so later cells can cast them to numbers.
cols = ['full_price', 'sold_part', 'floor']
# Keys are treated as regular expressions (regex=True below). The entries are
# kept in their original order; the multi-word Hebrew markers are listed
# before the ' ' rule that removes spaces.
replacements = {'₪': '',
                r'\$': '',  # raw string: '\$' is an invalid escape in a normal literal
                'קרקע': 0,  # "ground" (ground floor)
                'לא צוין מחיר': None,  # "price not specified"
                'לא צוין': None,  # "not specified"
                ',': '',
                '%': '',
                ' ': ''}
for col in cols:
    try:
        df.replace({col: replacements}, regex=True, inplace=True)
    except TypeError:
        # in case of double replacement
        pass
df.shape

(306832, 16)

In [9]:
# Derive integer year and month columns from the sale date. `sale_day` itself
# is left untouched (the former `df['sale_day'] = df['sale_day']` was a no-op).
# The .dt accessors are vectorized and replace a per-row strftime round trip.
df['sale_day_year'] = df['sale_day'].dt.year.astype(int)
df['sale_day_month'] = df['sale_day'].dt.month.astype(int)

In [10]:
# Drop deals with no floor value, cast the numeric columns to integers, then
# summarise the price distribution.
# Filtering and casting in one expression yields a fresh frame, so no column is
# assigned into a filtered slice (which triggers SettingWithCopyWarning).
# NOTE(review): casting rooms_number to int truncates any fractional room
# counts (e.g. 3.5 -> 3) — confirm this is intended.
df = df[df['floor'].notna()].astype({
    'floor': int,
    'building_mr': int,
    'rooms_number': int,
    'full_price': int,
})
df['full_price'].describe()


count    3.046380e+05
mean     1.279830e+06
std      1.071040e+06
min      2.000000e+00
25%      6.800000e+05
50%      1.109000e+06
75%      1.619000e+06
max      1.322710e+08
Name: full_price, dtype: float64

In [11]:
# Scale each deal's price up to the value of the whole property.
# sold_part is treated as the percentage of the property that was sold; a
# missing value is taken to mean the full property (100).
# The result of fillna is assigned back explicitly: calling
# `df['sold_part'].fillna(..., inplace=True)` acts on a temporary Series and
# does not update `df` when pandas Copy-on-Write is active.
df['sold_part'] = df['sold_part'].astype(float).fillna(100)
# Same arithmetic order as before, so the truncated integers are unchanged.
df['final_price'] = ((100 / df['sold_part']) * df['full_price']).astype(int)
# The two source columns are no longer needed once final_price exists.
df = df.drop(columns=['sold_part', 'full_price'])
print(df.head(5))


                        _id   sale_day        address    property_type  \
0  5cab7d497e16a98c7a79bd4a 2018-12-25  יונה הנביא 36  דירה בבית קומות   
1  5cab7d497e16a98c7a79bd4b 2018-12-07  יונה הנביא 25  דירה בבית קומות   
2  5cab7d497e16a98c7a79bd4c 2018-05-10  יונה הנביא 26  דירה בבית קומות   
3  5cab7d497e16a98c7a79bd4d 2018-01-24  יונה הנביא 24  דירה בבית קומות   
4  5cab7d497e16a98c7a79bd4f 2017-08-03  יונה הנביא 20  דירה בבית קומות   

   rooms_number  floor  build_year  building_mr         city      street  \
0             5      0        2017          126  תל אביב יפו  יונה הנביא   
1             3      1        2018           78  תל אביב יפו  יונה הנביא   
2             3      0        1950           58  תל אביב יפו  יונה הנביא   
3             1      1        1960           23  תל אביב יפו  יונה הנביא   
4             3      1        1999           68  תל אביב יפו  יונה הנביא   

  street_number neighborhood address_area address_neighborhood  sale_day_year  \
0            36  

In [12]:
# Lower and upper price thresholds; deals priced too low or too high are
# treated as outliers.
min_price, max_price = 100_000, 25_000_000

In [13]:
# Keep only deals whose final price lies strictly between the two thresholds.
df = df[df['final_price'].gt(min_price) & df['final_price'].lt(max_price)]
df.shape

(304431, 17)

In [14]:
# Order the deals from the most expensive to the cheapest.
df = df.sort_values('final_price', ascending=False)

In [15]:
# Write the cleaned deals to disk as UTF-8 CSV, leaving out the row index.
df.to_csv(
    'data/data_deals_done.csv',
    index=False,
    encoding='utf-8',
)