In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from joblib import dump , load
import plotly.express as px

In [2]:
df=pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46


In [3]:
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.2,random_state=1)

In [4]:
print("random Forest")
model2 = RandomForestRegressor(max_depth=25)
model2.fit(xtrain,ytrain)
print("score:",model2.score(xtest,ytest)*100)
pred = model2.predict(X)
print("mse:",mean_squared_error(y,pred))
print("mse:",mean_absolute_error(y,pred))


random Forest
score: 72.39532660976641
mse: 858679398.2125674
mse: 15222.202081487178


In [5]:
forest_score = cross_val_score(model2,X,y,cv=6)
print(forest_score,f"average:{forest_score.mean():.2f}",f"std:{forest_score.std():.2f}")

[0.80127104 0.71506587 0.71029101 0.72556601 0.49323581 0.61986472] average:0.68 std:0.10


GRIDSEARCHCV

In [6]:
params ={ 
    'n_estimators' : list(range(100,501,200)),
    'criterion' :["squared_error","absolute_error","poisson"],
    'max_depth': list(range(5,51,25)),

}
params

{'n_estimators': [100, 300, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 30]}

In [7]:
grid = GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=2)


In [8]:
grid.fit(X,y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 30],
                         'n_estimators': [100, 300, 500]},
             verbose=2)

In [9]:
gf=pd.DataFrame(grid.cv_results_)

In [10]:
gf.sort_values(by='rank_test_score',inplace=True)

In [11]:
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
6,0.80719,0.007513,0.018668,0.00377,absolute_error,5,100,"{'criterion': 'absolute_error', 'max_depth': 5...",0.823405,0.765896,0.619731,0.736344,0.085735,1
7,2.89472,0.446056,0.073917,0.023311,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824654,0.767699,0.61619,0.736181,0.087975,2
8,3.85412,0.067054,0.079143,0.010103,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.82352,0.767801,0.614188,0.73517,0.088519,3
1,0.976403,0.182775,0.058764,0.006884,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.803459,0.765791,0.617953,0.729067,0.080061,4
2,1.705682,0.358174,0.122713,0.030806,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.798999,0.768455,0.617532,0.728329,0.079331,5
0,0.384372,0.073329,0.016003,2e-06,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.801988,0.768877,0.612434,0.727767,0.082665,6
11,6.626636,0.778239,0.084405,0.009033,absolute_error,30,500,"{'criterion': 'absolute_error', 'max_depth': 3...",0.770753,0.758165,0.579938,0.702952,0.087136,7
9,1.318921,0.125686,0.029763,0.010134,absolute_error,30,100,"{'criterion': 'absolute_error', 'max_depth': 3...",0.764796,0.757183,0.583457,0.701812,0.083747,8
10,4.178287,0.440697,0.061549,0.004212,absolute_error,30,300,"{'criterion': 'absolute_error', 'max_depth': 3...",0.763463,0.758841,0.577649,0.699984,0.086525,9
5,2.26715,0.43709,0.114894,0.013362,squared_error,30,500,"{'criterion': 'squared_error', 'max_depth': 30...",0.75808,0.754649,0.583177,0.698635,0.081653,10


In [12]:
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5)

In [13]:
dump(grid.best_estimator_,"house_pricing_model_73.pkl")

['house_pricing_model_73.pkl']