In [1]:
#importing important libraries
import bz2
import pickle
import _pickle as cPickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the rental listings from the CSV file in the working directory.
DATA_FILE = 'rent-ideal.csv'
rent_data = pd.read_csv(DATA_FILE)
# Bare last expression so the notebook renders the (truncated) frame.
rent_data

Unnamed: 0,bedrooms,bathrooms,latitude,longitude,price
0,1,1.0,40.7108,-73.9539,2400
1,2,1.0,40.7513,-73.9722,3800
2,2,1.0,40.7575,-73.9625,3495
3,3,1.5,40.7145,-73.9425,3000
4,0,1.0,40.7439,-73.9743,2795
...,...,...,...,...,...
48295,3,1.0,40.8433,-73.9396,2800
48296,2,1.0,40.8198,-73.9578,2395
48297,1,1.0,40.5765,-73.9554,1850
48298,2,1.0,40.7448,-74.0017,4195


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, used here to look for noisy or implausible values in the data.
rent_data.describe()

Unnamed: 0,bedrooms,bathrooms,latitude,longitude,price
count,48300.0,48300.0,48300.0,48300.0,48300.0
mean,1.508799,1.178313,40.750782,-73.972365,3438.29795
std,1.092232,0.42612,0.03956,0.029563,1401.422247
min,0.0,0.0,40.5712,-74.094,1025.0
25%,1.0,1.0,40.7281,-73.9917,2495.0
50%,1.0,1.0,40.7516,-73.9779,3100.0
75%,2.0,1.0,40.774,-73.9547,4000.0
max,8.0,10.0,40.9154,-73.7001,9999.0


In [4]:
# Shrink the data set because GitHub has a 25 MB upload limit.
# .loc slices by label and includes the end label, so rows labelled
# 0 through LAST_ROW_LABEL are all kept.
LAST_ROW_LABEL = 21000
rent_data = rent_data.loc[0:LAST_ROW_LABEL]

In [5]:
#Splitting data
# Features are every column except the target; the target is the rent price.
X = rent_data.drop(columns='price')
y = rent_data['price']
# Hold out 25% of the rows for testing. random_state pins the shuffle so a
# fresh "Restart & Run All" reproduces the same split and the same scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [6]:
#Creating a model
# Candidate hyper-parameter values; the randomized search samples
# combinations from these lists instead of trying every one.
param_dist = {
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # An int is a feature count, so 1 means one feature per split.
    'max_features': [1, 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100, 200, 300],
}

# random_state makes the forest's bootstrap samples and per-split feature
# choices repeatable; without it every run fits a different model even though
# the search below is seeded. oob_score=True exposes model.oob_score_ after fit.
rf = RandomForestRegressor(n_jobs=-1, oob_score=True, random_state=42)

# Try 10 sampled parameter combinations with 3-fold cross-validation.
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=10, cv=3, random_state=42)

random_search.fit(X_train, y_train)
# With the default refit=True this is the best configuration, refit on all of
# X_train.
model = random_search.best_estimator_

In [7]:
# Report two scores for the tuned forest, one per line:
# first the out-of-bag score computed during fitting, then the score on the
# held-out test split.
oob_r2 = model.oob_score_
print(oob_r2)
test_r2 = model.score(X_test, y_test)
print(test_r2)

0.8242201660826639
0.8310139215564666


In [8]:
#Exporting model
# Write the tuned forest to a bz2-compressed pickle next to the notebook.
# Uses the public pickle module rather than the CPython-private _pickle
# accelerator; pickle picks up the C implementation itself when available.
# NOTE: only unpickle this file from a trusted source, because loading a
# pickle can execute arbitrary code.
with bz2.BZ2File("Regressor.pbz2", "wb") as f:
    pickle.dump(model, f)