# Deals - RandomForestClassifier
## Import and Load the Data

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline

warnings.filterwarnings("ignore")

In [2]:
try:
    df = pd.read_csv("data/data_deals_done.csv")
except Exception:
    print("Seems we have a problem")

In [3]:
print("File Columns are:", df.columns)
print("Number of rows:", df.shape[0])

File Columns are: Index(['sale_day', 'address', 'property_type', 'rooms_number', 'floor',
       'build_year', 'building_mr', 'city', 'final_price', 'sale_day_year',
       'sale_day_month', 'street', 'street_number', 'neighborhood',
       'address_area', 'address_neighborhood'],
      dtype='object')
Number of rows: 474665


In [4]:
used_features = [
    'street', 
    'neighborhood',
    'property_type', 
    'rooms_number', 
    'floor',
    'build_year', 
    'building_mr',
    'city',
    'sale_day_year'
]

In [5]:
#------------------------------------------------------------------------------
# accept a dataframe, remove outliers, return cleaned data in a new dataframe
# see http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
#------------------------------------------------------------------------------
def remove_outlier(df_in, col_name):
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.90)
    iqr = q3-q1 #Interquartile range
    fence_low  = q1-1.5*iqr
    fence_high = q3+1.5*iqr
    df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
    return df_out


In [6]:
_cities = df['city'].unique()

cities = []
for city in _cities:
    _df = df.loc[(df['city'] == city)]    
    if len(_df.index) > 250:
        cities.append(city)

test_size = 0.3

cities = {city: {"processor": None, "algo": None, "mse": None, "fig": None} for city in cities if '/' not in city}

print("Total number of cities is:", len(cities))


Total number of cities is: 77


In [7]:

for city in cities.keys():
    print(city)
    _df = df.loc[(df['city'] == city)]    
    _df = remove_outlier(_df, 'final_price')
    
    X = _df[used_features]
    y = _df['final_price']
    y, info = pd.qcut(np.array(y), 10, retbins=True)
    print(info)
    y = pd.DataFrame(y.codes)
    
    ohe = preprocessing.OneHotEncoder()
    X = ohe.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=101)

    rfc = RandomForestClassifier(n_estimators=150, random_state=1)
    rfc.fit(X_train,y_train)

    predictions = rfc.predict(X_test)
    print(classification_report(y_test,predictions))
    print(confusion_matrix(y_test,predictions))


חיפה
[  27840.   325000.   417229.2  500450.   595000.   693000.   800000.
  960000.  1200000.  1501684.6 3180000. ]
              precision    recall  f1-score   support

           0       0.61      0.71      0.66      1105
           1       0.44      0.43      0.44      1103
           2       0.36      0.33      0.35      1083
           3       0.36      0.36      0.36      1072
           4       0.36      0.35      0.35      1095
           5       0.35      0.36      0.36      1067
           6       0.39      0.37      0.38      1123
           7       0.44      0.43      0.44      1221
           8       0.45      0.43      0.44       982
           9       0.69      0.74      0.71      1077

   micro avg       0.45      0.45      0.45     10928
   macro avg       0.45      0.45      0.45     10928
weighted avg       0.45      0.45      0.45     10928

[[788 208  59  20  17   7   4   1   1   0]
 [294 471 179  79  36  17  16   6   2   3]
 [ 99 242 357 233  85  41  13   6   4 

              precision    recall  f1-score   support

           0       0.65      0.84      0.73        57
           1       0.44      0.43      0.44        60
           2       0.47      0.42      0.45        66
           3       0.42      0.50      0.46        64
           4       0.46      0.38      0.42        68
           5       0.44      0.40      0.42        70
           6       0.42      0.33      0.37        64
           7       0.34      0.46      0.39        52
           8       0.41      0.33      0.37        60
           9       0.55      0.59      0.57        70

   micro avg       0.47      0.47      0.47       631
   macro avg       0.46      0.47      0.46       631
weighted avg       0.46      0.47      0.46       631

[[48  7  0  0  1  0  1  0  0  0]
 [16 26 13  2  1  1  1  0  0  0]
 [ 6 14 28 12  2  3  1  0  0  0]
 [ 2  9  8 32  8  3  2  0  0  0]
 [ 0  3  5 14 26 12  5  2  0  1]
 [ 2  0  2 10 11 28  8  4  4  1]
 [ 0  0  2  2  5 12 21 13  5  4]
 [ 0  0  0

              precision    recall  f1-score   support

           0       0.65      0.67      0.66       523
           1       0.44      0.50      0.47       506
           2       0.42      0.36      0.39       519
           3       0.38      0.41      0.39       496
           4       0.37      0.35      0.36       540
           5       0.39      0.41      0.40       550
           6       0.41      0.33      0.37       526
           7       0.45      0.51      0.48       525
           8       0.60      0.62      0.61       526
           9       0.86      0.79      0.82       535

   micro avg       0.50      0.50      0.50      5246
   macro avg       0.50      0.50      0.49      5246
weighted avg       0.50      0.50      0.50      5246

[[350 130  24   6   7   4   0   1   1   0]
 [131 255  84  21   8   5   2   0   0   0]
 [ 31 136 186 105  39  10   6   3   1   2]
 [ 12  36 102 204 102  29   9   1   1   0]
 [  6  15  33 147 187 118  23   7   3   1]
 [  5   7  12  42 106 225 

              precision    recall  f1-score   support

           0       0.90      0.88      0.89        43
           1       0.51      0.50      0.51        40
           2       0.51      0.57      0.54        47
           3       0.52      0.60      0.56        45
           4       0.56      0.59      0.57        51
           5       0.59      0.49      0.54        53
           6       0.47      0.56      0.51        41
           7       0.43      0.42      0.43        45
           8       0.44      0.41      0.43        41
           9       0.74      0.61      0.67        57

   micro avg       0.57      0.57      0.57       463
   macro avg       0.57      0.56      0.56       463
weighted avg       0.57      0.57      0.57       463

[[38  4  0  1  0  0  0  0  0  0]
 [ 3 20 13  1  2  0  0  1  0  0]
 [ 0 11 27  8  1  0  0  0  0  0]
 [ 0  3  9 27  5  1  0  0  0  0]
 [ 1  1  1  8 30  7  1  2  0  0]
 [ 0  0  3  3 12 26  9  0  0  0]
 [ 0  0  0  2  2  5 23  3  4  2]
 [ 0  0  0

              precision    recall  f1-score   support

           0       0.58      0.74      0.65        42
           1       0.39      0.58      0.46        33
           2       0.40      0.24      0.30        49
           3       0.33      0.40      0.36        48
           4       0.29      0.18      0.23        49
           5       0.29      0.27      0.28        45
           6       0.37      0.43      0.40        37
           7       0.39      0.38      0.38        40
           8       0.39      0.46      0.42        48
           9       0.70      0.58      0.64        53

   micro avg       0.42      0.42      0.42       444
   macro avg       0.41      0.43      0.41       444
weighted avg       0.42      0.42      0.41       444

[[31  6  3  1  0  1  0  0  0  0]
 [ 7 19  3  2  1  0  1  0  0  0]
 [ 5 10 12 17  3  1  0  1  0  0]
 [ 2  6  5 19  6  3  5  1  0  1]
 [ 3  4  2 11  9  7  6  5  1  1]
 [ 2  2  2  4  6 12  8  4  4  1]
 [ 0  0  0  2  3  9 16  4  2  1]
 [ 1  0  2

              precision    recall  f1-score   support

           0       0.68      0.69      0.69        65
           1       0.41      0.47      0.44        53
           2       0.36      0.35      0.36        69
           3       0.41      0.44      0.42        68
           4       0.38      0.38      0.38        61
           5       0.39      0.42      0.41        66
           6       0.54      0.44      0.48        64
           7       0.54      0.49      0.52        51
           8       0.64      0.76      0.69        58
           9       0.86      0.73      0.79        70

   micro avg       0.52      0.52      0.52       625
   macro avg       0.52      0.52      0.52       625
weighted avg       0.52      0.52      0.52       625

[[45 11  4  3  1  0  0  1  0  0]
 [12 25  9  5  1  1  0  0  0  0]
 [ 2 16 24 18  7  2  0  0  0  0]
 [ 3  4 10 30 11  7  1  1  1  0]
 [ 1  2 10  8 23 12  3  1  0  1]
 [ 1  2  8  6 10 28  8  2  0  1]
 [ 2  1  0  3  6 14 28  8  1  1]
 [ 0  0  0

              precision    recall  f1-score   support

           0       0.72      0.78      0.75       628
           1       0.58      0.54      0.56       645
           2       0.54      0.52      0.53       625
           3       0.48      0.50      0.49       638
           4       0.49      0.47      0.48       688
           5       0.41      0.42      0.42       611
           6       0.47      0.47      0.47       620
           7       0.52      0.53      0.53       722
           8       0.53      0.49      0.51       616
           9       0.74      0.76      0.75       645

   micro avg       0.55      0.55      0.55      6438
   macro avg       0.55      0.55      0.55      6438
weighted avg       0.55      0.55      0.55      6438

[[492  82  22  11  12   7   1   1   0   0]
 [153 347 108  16  12   2   3   4   0   0]
 [ 20 129 327 117  16  13   3   0   0   0]
 [ 13  30 117 319 120  26   2   9   0   2]
 [  5   9  23 140 324 134  29  18   5   1]
 [  3   2   6  34 124 258 

              precision    recall  f1-score   support

           0       0.50      0.64      0.56        33
           1       0.26      0.23      0.25        30
           2       0.24      0.24      0.24        25
           3       0.14      0.22      0.17        18
           4       0.37      0.35      0.36        31
           5       0.12      0.09      0.11        32
           6       0.55      0.35      0.43        31
           7       0.25      0.29      0.27        24
           8       0.67      0.29      0.41        41
           9       0.38      0.74      0.50        23

   micro avg       0.34      0.34      0.34       288
   macro avg       0.35      0.35      0.33       288
weighted avg       0.37      0.34      0.34       288

[[21  4  3  2  1  0  0  0  0  2]
 [ 7  7  6  1  4  3  1  1  0  0]
 [ 7  5  6  5  0  0  0  1  0  1]
 [ 1  1  2  4  5  4  0  0  0  1]
 [ 2  3  1  5 11  8  1  0  0  0]
 [ 0  1  2  8  6  3  3  8  1  0]
 [ 2  4  2  1  1  4 11  4  1  1]
 [ 1  2  2

              precision    recall  f1-score   support

           0       0.60      0.76      0.67       170
           1       0.42      0.40      0.41       176
           2       0.40      0.43      0.42       189
           3       0.27      0.23      0.25       201
           4       0.35      0.33      0.34       197
           5       0.37      0.39      0.38       191
           6       0.34      0.34      0.34       175
           7       0.47      0.40      0.43       195
           8       0.47      0.50      0.49       174
           9       0.71      0.72      0.71       196

   micro avg       0.45      0.45      0.45      1864
   macro avg       0.44      0.45      0.44      1864
weighted avg       0.44      0.45      0.44      1864

[[129  29   5   3   2   2   0   0   0   0]
 [ 40  70  44  14   5   2   0   1   0   0]
 [ 10  34  81  37  15   7   1   1   3   0]
 [ 17  22  40  46  46  15   7   3   4   1]
 [  7   2  15  38  65  34  22   9   1   4]
 [  4   5  13  15  28  74 

              precision    recall  f1-score   support

           0       0.67      0.82      0.73        22
           1       0.92      0.72      0.81        32
           2       0.56      0.75      0.64        20
           3       0.33      0.41      0.37        17
           4       0.32      0.28      0.30        25
           5       0.47      0.38      0.42        21
           6       0.41      0.41      0.41        22
           7       0.38      0.37      0.37        30
           8       0.60      0.33      0.43        27
           9       0.47      0.67      0.55        27

   micro avg       0.51      0.51      0.51       243
   macro avg       0.51      0.51      0.50       243
weighted avg       0.53      0.51      0.51       243

[[18  2  1  0  1  0  0  0  0  0]
 [ 4 23  4  0  1  0  0  0  0  0]
 [ 1  0 15  3  1  0  0  0  0  0]
 [ 1  0  6  7  2  1  0  0  0  0]
 [ 3  0  1  7  7  2  3  0  0  2]
 [ 0  0  0  1  6  8  2  2  1  1]
 [ 0  0  0  0  3  3  9  5  0  2]
 [ 0  0  0

              precision    recall  f1-score   support

           0       0.59      0.78      0.67        41
           1       0.56      0.51      0.54        43
           2       0.64      0.61      0.62        49
           3       0.52      0.57      0.54        51
           4       0.84      0.77      0.80        48
           5       0.62      0.53      0.57        57
           6       0.53      0.66      0.59        44
           7       0.66      0.44      0.53        43
           8       0.51      0.58      0.54        43
           9       0.68      0.66      0.67        58

   micro avg       0.61      0.61      0.61       477
   macro avg       0.61      0.61      0.61       477
weighted avg       0.62      0.61      0.61       477

[[32  4  2  2  0  0  0  0  0  1]
 [13 22  5  3  0  0  0  0  0  0]
 [ 2  6 30  9  0  0  0  0  0  2]
 [ 4  4  7 29  2  1  2  1  1  0]
 [ 1  0  0  6 37  1  1  2  0  0]
 [ 1  2  2  2  0 30 12  2  3  3]
 [ 0  1  1  0  0  7 29  4  0  2]
 [ 0  0  0