In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error

In [2]:
# Read the processed, tab-separated dataset.
df = pd.read_csv('../cleaning/output/processed_data.tsv', sep='\t')

# 'price' is the regression target; it is removed from the feature matrix
# together with the 'title', 'province' and 'url_id' columns.
y = df['price']
X = df.drop(columns=['price', 'title', 'province', 'url_id'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=128,
)

In [3]:
# Sanity-check the feature matrix: show the first 5 training rows.
print(X_train.head(n=5))

       area  number_of_bedrooms  number_of_toilets  legal        lat  \
16963  52.0                   4                  3      2  21.044980   
6543   34.0                   3                  3      2  20.998491   
4356   50.0                   6                  6      2  20.962283   
13351  43.0                   4                  3      2  21.010923   
10508  52.0                   6                  5     -1  20.970895   

              lon  district  
16963  105.810730        10  
6543   105.851598        14  
4356   105.768498         5  
13351  105.818020         3  
10508  105.827205         8  


In [4]:
# Print the target Series y (the 'price' column); for a long Series pandas
# shows only the first and last rows plus the name, length and dtype.
print(y)

0        16.8
1        13.2
2         8.0
3        16.5
4        11.0
         ... 
17751    15.0
17752     8.1
17753     9.8
17754     8.0
17755     6.0
Name: price, Length: 17756, dtype: float64


In [5]:
# Build an XGBoost regressor (only the seed is set; all other hyperparameters
# are left at their defaults) and fit it on the training split in one step.
# fit() returns the fitted estimator, so `model` is the trained regressor.
model = XGBRegressor(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows, used for the evaluation metrics.
y_pred = model.predict(X_test)

In [6]:
# Score the held-out predictions with three regression metrics, each called
# as metric(y_true, y_pred).
# NOTE: scikit-learn reports MAPE as a fraction (e.g. 0.16 means roughly 16%),
# not as a value already multiplied by 100.
mae, r2, mape = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_absolute_percentage_error)
)

# One report line per metric.
print(
    f'Mean Absolute Error: {mae}',
    f'R-squared: {r2}',
    f'Mean Absolute Percentage Error (MAPE): {mape}',
    sep='\n',
)

Mean Absolute Error: 1.628997621470749
R-squared: 0.7461772398535751
Mean Absolute Percentage Error (MAPE): 0.1635992626765103


In [7]:
# Persist the fitted regressor to disk so it can be reloaded without retraining.
model_path = './saved/xgb_model.joblib'

# joblib.dump does not create missing parent directories; on a fresh checkout
# (or after Restart & Run All without a pre-made ./saved folder) the dump would
# fail with FileNotFoundError. Create the directory first; no-op if it exists.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, model_path)

print("Model saved successfully")

Model saved successfully
