In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import IsolationForest

from sklearn.metrics import r2_score , mean_absolute_error

from catboost import CatBoostRegressor

In [3]:
data = pd.read_csv('./House_Price_dataset.csv')
data.head()

Unnamed: 0,property_id,location_id,page_url,property_type,price,location,city,province_name,latitude,longitude,baths,area,purpose,bedrooms,date_added,agency,agent,Area Type,Area Size,Area Category
0,237062,3325,https://www.zameen.com/Property/g_10_g_10_2_gr...,Flat,10000000,G-10,Islamabad,Islamabad Capital,33.67989,73.01264,2,4 Marla,For Sale,2,02-04-2019,,,Marla,4.0,0-5 Marla
1,346905,3236,https://www.zameen.com/Property/e_11_2_service...,Flat,6900000,E-11,Islamabad,Islamabad Capital,33.700993,72.971492,3,5.6 Marla,For Sale,3,05-04-2019,,,Marla,5.6,5-10 Marla
2,386513,764,https://www.zameen.com/Property/islamabad_g_15...,House,16500000,G-15,Islamabad,Islamabad Capital,33.631486,72.926559,6,8 Marla,For Sale,5,07-17-2019,,,Marla,8.0,5-10 Marla
3,656161,340,https://www.zameen.com/Property/islamabad_bani...,House,43500000,Bani Gala,Islamabad,Islamabad Capital,33.707573,73.151199,4,2 Kanal,For Sale,4,04-05-2019,,,Kanal,2.0,1-5 Kanal
4,841645,3226,https://www.zameen.com/Property/dha_valley_dha...,House,7000000,DHA Defence,Islamabad,Islamabad Capital,33.492591,73.301339,3,8 Marla,For Sale,3,07-10-2019,Easy Property,Muhammad Junaid Ceo Muhammad Shahid Director,Marla,8.0,5-10 Marla


In [4]:
delLists = ['property_id' , 'page_url' , 'agency' , 'agent' , 'Area Category', 'Area Type' , 'Area Size' , 'area' , 'date_added' , 'location' , 'location_id' , 'province_name' , 'purpose']

In [5]:
data.drop(delLists , axis=1 , inplace=True)

In [6]:
categorical_features = ['property_type', 'city']

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

In [7]:
encoded_array = preprocessor.fit_transform(data)
onehot_columns = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_features)
remaining_columns = [col for col in data.columns if col not in categorical_features]
all_columns = list(onehot_columns) + remaining_columns
data = pd.DataFrame(encoded_array , columns=all_columns)

In [8]:
data['lastPrice'] = data['price']
data.drop(['price'] , axis=1 , inplace=True)

In [9]:
iso_forest = IsolationForest(contamination=0.05, random_state=1441)
outlier_predictions = iso_forest.fit_predict(data[['lastPrice']])
data = data[outlier_predictions != -1]

In [10]:
x = data.iloc[: , :-1]
y = data.iloc[: , -1]

y = y.values.reshape(-1, 1)

In [11]:
numerical_columns = ['latitude' , 'longitude' , 'baths' , 'bedrooms']
scaler = StandardScaler()
x[numerical_columns] = scaler.fit_transform(x[numerical_columns])
y = scaler.fit_transform(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[numerical_columns] = scaler.fit_transform(x[numerical_columns])


In [12]:
x_train , x_test , y_train , y_test = train_test_split(x , y , random_state=1441)

In [15]:
model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.01,
    depth=12, 
    l2_leaf_reg=10,  
    loss_function='RMSE',  
    eval_metric='R2',
    boosting_type='Plain',
    bagging_temperature=3,
    random_seed=1441,
    early_stopping_rounds=100,
    verbose=200,
    task_type='GPU', 
    max_bin=254, 
    grow_policy="Depthwise",
    use_best_model=True 
)

In [16]:
model.fit(x_train , y_train , eval_set=(x_test , y_test))

Default metric period is 5 because R2 is/are not implemented for GPU
Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.0102722	test: 0.0099167	best: 0.0099167 (0)	total: 47.1ms	remaining: 1m 34s
200:	learn: 0.5951093	test: 0.5615440	best: 0.5615440 (200)	total: 5.89s	remaining: 52.7s
400:	learn: 0.6403397	test: 0.5947128	best: 0.5947128 (400)	total: 10.4s	remaining: 41.4s
600:	learn: 0.6565823	test: 0.6029190	best: 0.6029190 (600)	total: 14.5s	remaining: 33.7s
800:	learn: 0.6671901	test: 0.6069806	best: 0.6069806 (800)	total: 18.6s	remaining: 27.9s
1000:	learn: 0.6750581	test: 0.6091844	best: 0.6091844 (1000)	total: 22.8s	remaining: 22.8s
1200:	learn: 0.6815340	test: 0.6107450	best: 0.6107450 (1200)	total: 27.1s	remaining: 18s
1400:	learn: 0.6871936	test: 0.6119682	best: 0.6119682 (1400)	total: 31.5s	remaining: 13.5s
1600:	learn: 0.6920736	test: 0.6126597	best: 0.6126641 (1597)	total: 36s	remaining: 8.98s
1800:	learn: 0.6961518	test: 0.6131205	best: 0.6131205 (1800)	total: 40.5s	remaining: 4.48s
1999:	learn: 0.6994769	test: 0.6133080	best: 0.6133157 (1995)	total: 44.9s	remaining: 0us
bestTe

<catboost.core.CatBoostRegressor at 0x7062d7b974a0>

In [17]:
pred = model.predict(x_test)
print(r2_score(y_test , pred))
print(mean_absolute_error(y_test , pred))

0.6133157367488277
0.36469287276954593


In [18]:
model.save_model('Model.h5')