In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly_express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from joblib import dump
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate

In [70]:
url = "https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46
2,SACRAMENTO,Residential,2,1,796,119095.12
3,SACRAMENTO,Residential,2,1,852,130904.95
4,SACRAMENTO,Residential,2,1,797,120266.19


In [None]:
df.SquareFeet.describe()

In [None]:
cities = df.City.unique().tolist()

In [None]:
resident_types = df.Type.unique().tolist()

In [71]:
df.describe(include='object')

Unnamed: 0,City,Type
count,814,814
unique,36,3
top,SACRAMENTO,Residential
freq,424,759


In [72]:
df.isna().sum()   # to find missing values

City          0
Type          0
Beds          0
Baths         0
SquareFeet    0
Price         0
dtype: int64

In [73]:
cityenc = OneHotEncoder(drop='first')
city_dummies = cityenc.fit_transform(df[['City']]).toarray()

res_type = OneHotEncoder(drop='first')
res_dummies = res_type.fit_transform(df[['Type']]).toarray()

In [74]:
df.drop(columns=['City','Type'], inplace=True)
df = pd.concat([df, pd.DataFrame(city_dummies), pd.DataFrame(res_dummies)], axis=1)
df.head()

Unnamed: 0,Beds,Baths,SquareFeet,Price,0,1,2,3,4,5,...,27,28,29,30,31,32,33,34,0.1,1.1
0,2,1,836,138159.85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,1,1167,167541.46,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,1,796,119095.12,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,1,852,130904.95,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,1,797,120266.19,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [75]:
X = df.drop(columns=['Price'])
y = df['Price']

In [76]:
X.columns = X.columns.astype(str)

In [77]:
scaler = StandardScaler()
x = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

Training

In [78]:
model1 = DecisionTreeRegressor()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
print("Decision Tree Regression:")
print('MSE',mean_squared_error(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('R2', r2_score(y_test,y_pred))

Decision Tree Regression:
MSE 1381077044.0749595
MAE 15795.767137014313
R2 0.835140477352338


In [79]:
model2 = RandomForestRegressor()
model2.fit(X_train, y_train)
y_pred = model1.predict(X_test)
print("Decision Tree Regression:")
print('MSE',mean_squared_error(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('R2', r2_score(y_test,y_pred))

Decision Tree Regression:
MSE 1381077044.0749595
MAE 15795.767137014313
R2 0.835140477352338


In [80]:
for i in range(5,50,5):
    model2 = RandomForestRegressor(max_depth=i)
    model2.fit(X_train, y_train)
    y_pred = model2.predict(X_test)
    print(f'Random Forest Regression with max_depth:{i}')
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('R2:', r2_score(y_test, y_pred))
    print('-'*25)

Random Forest Regression with max_depth:5
MSE: 1264098807.655385
MAE: 19769.366922390986
R2: 0.849104199578431
-------------------------
Random Forest Regression with max_depth:10
MSE: 1172907083.0059826
MAE: 15130.087235928953
R2: 0.8599897792494675
-------------------------
Random Forest Regression with max_depth:15
MSE: 1124307246.895158
MAE: 14570.555447808092
R2: 0.8657911542099438
-------------------------
Random Forest Regression with max_depth:20
MSE: 1099103867.3461285
MAE: 14661.149708581466
R2: 0.8687996881214924
-------------------------
Random Forest Regression with max_depth:25
MSE: 1175117931.7051997
MAE: 14622.397988601608
R2: 0.8597258696705177
-------------------------
Random Forest Regression with max_depth:30
MSE: 1155501928.0952811
MAE: 14537.129624413717
R2: 0.8620674370763766
-------------------------
Random Forest Regression with max_depth:35
MSE: 1051155946.3733352
MAE: 14264.706277259436
R2: 0.8745232438039469
-------------------------
Random Forest Regression

Cross Validation

In [81]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate

In [82]:
model1 = DecisionTreeRegressor()
cv = ShuffleSplit(n_splits=10,test_size=0.2,random_state=0)
cv_result = cross_validate(model1,X,y ,cv=cv, scoring=('r2','neg_mean_absolute_error'))

In [83]:
results_df = pd.DataFrame(cv_result)
results_df

Unnamed: 0,fit_time,score_time,test_r2,test_neg_mean_absolute_error
0,0.083878,0.050951,0.947123,-11682.757055
1,0.083134,0.050406,0.874966,-15611.457464
2,0.093781,0.029301,0.817775,-14571.271329
3,0.112927,0.089597,0.86862,-15715.607853
4,0.126896,0.076426,0.833936,-17765.784785
5,0.167667,0.091498,0.85803,-16733.896626
6,0.152992,0.088974,0.921047,-14937.660777
7,0.185896,0.070702,0.835921,-14874.303354
8,0.250582,0.124752,0.870738,-17105.725399
9,0.166397,0.039938,0.92309,-13244.727526


In [84]:
results_df['test_r2'].describe()

count    10.000000
mean      0.875125
std       0.042778
min       0.817775
25%       0.841448
50%       0.869679
75%       0.909527
max       0.947123
Name: test_r2, dtype: float64

In [None]:
from sklearn.model_selection import learning_curve

lc_results = learning_curve(model2, X, y, cv=5, train_sizes=np.linspace(0.1,1.0,10))

In [None]:
test_scores_df = pd.DataFrame(lc_results[2])
test_scores_df['train_record'] = lc_results[0]
test_scores_df['mean_test_score'] = test_scores_df.loc[:,[0,1,2,3,4]].mean(axis=1)
test_scores_df

Unnamed: 0,0,1,2,3,4,train_record,mean_test_score
0,0.595492,0.420352,0.220233,0.539924,0.358554,65,0.426911
1,0.919419,0.750454,0.690402,0.63577,0.692496,130,0.737708
2,0.916011,0.768865,0.716532,0.641567,0.707135,195,0.750022
3,0.920723,0.77811,0.731018,0.679455,0.717885,260,0.765438
4,0.936313,0.807821,0.780027,0.738197,0.722201,325,0.796912
5,0.930493,0.766496,0.774506,0.739033,0.723926,390,0.786891
6,0.934089,0.780319,0.779954,0.749975,0.720267,455,0.792921
7,0.930688,0.833574,0.790301,0.821156,0.732943,520,0.821732
8,0.941738,0.855997,0.831564,0.824189,0.735807,585,0.837859
9,0.949567,0.873002,0.831914,0.825576,0.729356,651,0.841883


In [None]:
px.line(test_scores_df, x='train_record', y=[0,1,2,3,4], title='Learning Curve')

In [None]:
model_dict= {
    'city_encoder': cityenc,
    'resident_type_encoder': res_type,
    'scaler': scaler,
    'model': model2,
    'cities': cities,
    'resident_types': resident_types,
    'description':'''
    Model trained on the house_pricing dataset.
    cities = get from cities list
    resident_types = get from resident_types list
    ''',
    'author': 'Digipodium',
}
dump(model_dict,'../../models/house_pricing_model.pk') 

NameError: name 'cities' is not defined