In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from joblib import dump , load
import plotly.express as px

In [3]:
df=pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46


In [4]:
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.2,random_state=1)

In [5]:
print("random Forest")
model2 = RandomForestRegressor(max_depth=25)
model2.fit(xtrain,ytrain)
print("score:",model2.score(xtest,ytest)*100)
pred = model2.predict(X)
print("mse:",mean_squared_error(y,pred))
print("mse:",mean_absolute_error(y,pred))


random Forest
score: 71.11673088620385
mse: 877676719.4740965
mse: 15348.584853040153


In [6]:
forest_score = cross_val_score(model2,X,y,cv=6)
print(forest_score,f"average:{forest_score.mean():.2f}",f"std:{forest_score.std():.2f}")

[0.81403443 0.71038622 0.69323324 0.72922344 0.50416934 0.61874501] average:0.68 std:0.10


GRIDSEARCHCV

In [7]:
params ={ 
    'n_estimators' : list(range(100,501,200)),
    'criterion' :["squared_error","absolute_error","poisson"],
    'max_depth': list(range(5,51,25)),

}
params

{'n_estimators': [100, 300, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 30]}

In [8]:
grid = GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=2)


In [9]:
grid.fit(X,y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 30],
                         'n_estimators': [100, 300, 500]},
             verbose=2)

In [10]:
gf=pd.DataFrame(grid.cv_results_)

In [11]:
gf.sort_values(by='rank_test_score',inplace=True)

In [12]:
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
6,0.589216,0.060703,0.011216,0.003659,absolute_error,5,100,"{'criterion': 'absolute_error', 'max_depth': 5...",0.828736,0.76509,0.617126,0.736984,0.088646,1
7,1.752221,0.283561,0.042667,0.009977,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824968,0.765577,0.6159,0.735482,0.087964,2
8,3.904739,0.059004,0.086254,0.022362,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824175,0.767108,0.613024,0.734769,0.089183,3
0,0.202667,0.009978,0.010667,0.003771,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.801349,0.76824,0.615578,0.728389,0.080907,4
1,0.704001,0.066292,0.072,0.022628,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.799336,0.767928,0.615232,0.727499,0.080413,5
2,1.11888,0.074685,0.071989,0.011349,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.801197,0.768456,0.61113,0.726928,0.082965,6
9,0.712524,0.013909,0.018713,0.003831,absolute_error,30,100,"{'criterion': 'absolute_error', 'max_depth': 3...",0.762739,0.757398,0.582509,0.700882,0.083731,7
10,2.29655,0.030863,0.038403,0.004715,absolute_error,30,300,"{'criterion': 'absolute_error', 'max_depth': 3...",0.759592,0.7578,0.583166,0.700186,0.082749,8
11,3.969192,0.514522,0.051697,0.008274,absolute_error,30,500,"{'criterion': 'absolute_error', 'max_depth': 3...",0.760407,0.759366,0.57799,0.699254,0.085748,9
4,0.716838,0.033201,0.040633,0.000895,squared_error,30,300,"{'criterion': 'squared_error', 'max_depth': 30...",0.754354,0.756336,0.57877,0.696487,0.083242,10


In [13]:
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5)

In [14]:
dump(grid.best_estimator_,"house_pricing_model_73.pkl")

['house_pricing_model_73.pkl']