### Predicting City House with Scikit-Learn

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('houses_to_rent.csv')
print(data.shape)

(6080, 14)


In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,0,1,240,3,3,4,-,acept,furnished,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,1,0,64,2,1,1,10,acept,not furnished,R$540,R$820,R$122,R$11,"R$1,493"
2,2,1,443,5,5,4,3,acept,furnished,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,3,1,73,2,2,1,12,acept,not furnished,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,4,1,19,1,1,0,-,not acept,not furnished,R$0,"R$1,200",R$41,R$16,"R$1,257"


In [4]:
data.drop(data.columns[0], axis=1, inplace=True)

In [5]:
data.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,-,acept,furnished,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,0,64,2,1,1,10,acept,not furnished,R$540,R$820,R$122,R$11,"R$1,493"
2,1,443,5,5,4,3,acept,furnished,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,1,73,2,2,1,12,acept,not furnished,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,1,19,1,1,0,-,not acept,not furnished,R$0,"R$1,200",R$41,R$16,"R$1,257"


In [6]:
data['floor'].replace(to_replace='-', value=0, inplace=True)

In [7]:
data.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,0,acept,furnished,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,0,64,2,1,1,10,acept,not furnished,R$540,R$820,R$122,R$11,"R$1,493"
2,1,443,5,5,4,3,acept,furnished,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,1,73,2,2,1,12,acept,not furnished,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,1,19,1,1,0,0,not acept,not furnished,R$0,"R$1,200",R$41,R$16,"R$1,257"


In [8]:
data['animal'].replace(to_replace='not acept', value=0, inplace=True)
data['animal'].replace(to_replace='acept', value=1, inplace=True)

In [9]:
data.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,0,1,furnished,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,0,64,2,1,1,10,1,not furnished,R$540,R$820,R$122,R$11,"R$1,493"
2,1,443,5,5,4,3,1,furnished,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,1,73,2,2,1,12,1,not furnished,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,1,19,1,1,0,0,0,not furnished,R$0,"R$1,200",R$41,R$16,"R$1,257"


In [10]:
data['furniture'].replace(to_replace='not furnished', value=0, inplace=True)
data['furniture'].replace(to_replace='furnished', value=1, inplace=True)

In [11]:
for col in ['hoa', 'rent amount', 'property tax', 'fire insurance', 'total']:
    data[col].replace(to_replace='R\$', value='', regex=True, inplace=True)
    data[col].replace(to_replace=',', value='', regex=True, inplace=True)

In [12]:
data.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,0,1,1,0,8000,1000,121,9121
1,0,64,2,1,1,10,1,0,540,820,122,11,1493
2,1,443,5,5,4,3,1,1,4172,7000,1417,89,12680
3,1,73,2,2,1,12,1,0,700,1250,150,16,2116
4,1,19,1,1,0,0,0,0,0,1200,41,16,1257


In [13]:
data['hoa'].replace(to_replace='Sem info', value='0', inplace=True)

In [14]:
data['hoa'].replace(to_replace='Incluso', value='0', inplace=True)
data['property tax'].replace(to_replace='Incluso', value='0', inplace=True)

In [15]:
data = data.astype(dtype=np.int64)

In [16]:
data

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,0,1,1,0,8000,1000,121,9121
1,0,64,2,1,1,10,1,0,540,820,122,11,1493
2,1,443,5,5,4,3,1,1,4172,7000,1417,89,12680
3,1,73,2,2,1,12,1,0,700,1250,150,16,2116
4,1,19,1,1,0,0,0,0,0,1200,41,16,1257
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,1,50,2,1,1,2,1,0,420,1150,0,15,1585
6076,1,84,2,2,1,16,0,1,768,2900,63,37,3768
6077,0,48,1,1,0,13,1,0,250,950,42,13,1255
6078,1,160,3,2,2,0,0,0,0,3500,250,53,3803


In [17]:
data = data.sample(frac=1).reset_index(drop=True)
data

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,52,1,2,1,13,1,0,1100,5000,210,64,6374
1,1,120,2,3,2,19,1,1,0,9920,0,126,10050
2,1,57,2,2,1,15,1,0,520,1911,0,20,2541
3,1,94,3,2,3,5,1,0,820,2200,350,28,3398
4,1,50,2,1,0,1,1,0,54,2000,42,26,2122
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,0,55,1,1,0,5,1,0,473,500,17,7,997
6076,1,86,2,1,1,12,1,0,1000,3007,84,39,4130
6077,1,49,2,1,1,6,1,0,277,2100,104,27,2508
6078,1,60,2,1,0,6,1,1,450,1524,0,5,1979


In [18]:
y = data['city']
X = data.drop('city', axis=1)

In [19]:
X

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,52,1,2,1,13,1,0,1100,5000,210,64,6374
1,120,2,3,2,19,1,1,0,9920,0,126,10050
2,57,2,2,1,15,1,0,520,1911,0,20,2541
3,94,3,2,3,5,1,0,820,2200,350,28,3398
4,50,2,1,0,1,1,0,54,2000,42,26,2122
...,...,...,...,...,...,...,...,...,...,...,...,...
6075,55,1,1,0,5,1,0,473,500,17,7,997
6076,86,2,1,1,12,1,0,1000,3007,84,39,4130
6077,49,2,1,1,6,1,0,277,2100,104,27,2508
6078,60,2,1,0,6,1,1,450,1524,0,5,1979


In [20]:
y

0       1
1       1
2       1
3       1
4       1
       ..
6075    0
6076    1
6077    1
6078    1
6079    1
Name: city, Length: 6080, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
from sklearn.preprocessing import MinMaxScaler

In [23]:
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [24]:
X

array([[0.00170759, 0.        , 0.11111111, ..., 0.0005733 , 0.09050445,
        0.01535856],
       [0.00447227, 0.11111111, 0.22222222, ..., 0.        , 0.18249258,
        0.02523922],
       [0.00191088, 0.11111111, 0.11111111, ..., 0.        , 0.02522255,
        0.00505591],
       ...,
       [0.00158562, 0.11111111, 0.        , ..., 0.00028392, 0.03560831,
        0.00496721],
       [0.00203285, 0.11111111, 0.        , ..., 0.        , 0.00296736,
        0.00354532],
       [0.00601724, 0.22222222, 0.22222222, ..., 0.00144417, 0.09940653,
        0.01998172]])

In [25]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.001708,0.000000,0.111111,0.083333,0.131313,1.0,0.0,0.005000,0.102737,0.000573,0.090504,0.015359
1,0.004472,0.111111,0.222222,0.166667,0.191919,1.0,1.0,0.000000,0.213100,0.000000,0.182493,0.025239
2,0.001911,0.111111,0.111111,0.083333,0.151515,1.0,0.0,0.002364,0.033445,0.000000,0.025223,0.005056
3,0.003415,0.222222,0.111111,0.250000,0.050505,1.0,0.0,0.003727,0.039928,0.000956,0.037092,0.007359
4,0.001626,0.111111,0.000000,0.000000,0.010101,1.0,0.0,0.000245,0.035442,0.000115,0.034125,0.003930
...,...,...,...,...,...,...,...,...,...,...,...,...
6075,0.001830,0.000000,0.000000,0.000000,0.050505,1.0,0.0,0.002150,0.001795,0.000046,0.005935,0.000906
6076,0.003090,0.111111,0.000000,0.083333,0.121212,1.0,0.0,0.004545,0.058031,0.000229,0.053412,0.009327
6077,0.001586,0.111111,0.000000,0.083333,0.060606,1.0,0.0,0.001259,0.037685,0.000284,0.035608,0.004967
6078,0.002033,0.111111,0.000000,0.000000,0.060606,1.0,1.0,0.002045,0.024764,0.000000,0.002967,0.003545


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [27]:
X_train.shape, y_train.shape

((4864, 12), (4864,))

In [28]:
X_test.shape, y_test.shape

((1216, 12), (1216,))

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [31]:
log_model = LogisticRegression(penalty='l2', verbose=1)
svm_model = SVC(verbose=1)
nn_model = MLPClassifier(hidden_layer_sizes=(16,16), activation='relu', solver='adam', verbose=1)

In [32]:
log_model.fit(X_train, y_train)
nn_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           13     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  3.60403D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   13     25     26      1     0     0   3.907D-05   3.721D-01
  F =  0.37209085346170162     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
Iteration 1, loss = 0.67489916
Iteration 2, loss = 0.57825453
Iteration 3, loss = 0.48515365
Iteration 4, loss = 0.42172622
Iteration 5, loss = 0.40073269
Iteration 6, loss = 0.39655660
Iteratio

 This problem is unconstrained.


Iteration 10, loss = 0.38771307
Iteration 11, loss = 0.38585403
Iteration 12, loss = 0.38427278
Iteration 13, loss = 0.38299221
Iteration 14, loss = 0.38212390
Iteration 15, loss = 0.38112280
Iteration 16, loss = 0.38009645
Iteration 17, loss = 0.37915364
Iteration 18, loss = 0.37814463
Iteration 19, loss = 0.37729448
Iteration 20, loss = 0.37637470
Iteration 21, loss = 0.37554505
Iteration 22, loss = 0.37441896
Iteration 23, loss = 0.37347322
Iteration 24, loss = 0.37261131
Iteration 25, loss = 0.37159963
Iteration 26, loss = 0.37069761
Iteration 27, loss = 0.36950721
Iteration 28, loss = 0.36862125
Iteration 29, loss = 0.36730778
Iteration 30, loss = 0.36643832
Iteration 31, loss = 0.36542807
Iteration 32, loss = 0.36458422
Iteration 33, loss = 0.36330369
Iteration 34, loss = 0.36230005
Iteration 35, loss = 0.36151492
Iteration 36, loss = 0.36051904
Iteration 37, loss = 0.35967284
Iteration 38, loss = 0.35861652
Iteration 39, loss = 0.35756343
Iteration 40, loss = 0.35673551
Iteratio

In [33]:
print(log_model.score(X_test, y_test))
print(svm_model.score(X_test, y_test))
print(nn_model.score(X_test, y_test))

0.875
0.875
0.9087171052631579


In [34]:
data[data.columns[0]].sum()/data.shape[0]

0.8633223684210526

In [35]:
from sklearn.metrics import f1_score

In [36]:
log_pred = log_model.predict(X_test)
svm_pred = svm_model.predict(X_test)
nn_pred = nn_model.predict(X_test)

In [40]:
print(f1_score(log_pred, y_test))
print(f1_score(svm_pred, y_test))
print(f1_score(nn_pred, y_test))

0.9333333333333333
0.9333333333333333
0.9497054825555052
