In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
import re

In [2]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.column_name]

In [4]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    filtered = [word for word in tokens if word not in stop_words]
    return " ".join(filtered)

In [17]:

from google.colab import files
uploaded = files.upload()


data = pd.read_csv('DATASET.csv')

data['DESCRIPTION'] = data['DESCRIPTION'].apply(clean_text)

y = pd.to_numeric(data['PRICE'], errors='coerce')


Saving DATASET.csv to DATASET.csv


In [18]:
mask = ~y.isna()
data = data[mask]
y = y[mask]

X = data.drop(columns=['PRICE'])

X['is_rent'] = X['PURPOSE'].apply(lambda x: 1 if 'rent' in str(x).lower() else 0)


In [19]:
categorical_features = ['TYPE', 'PURPOSE', 'LOCATION']
numeric_features = ['AREA', 'BUILD IN YEAR', 'BEDROOMS', 'BATHROOMS', 'PARKING SPACES', 'is_rent']

numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [20]:
preprocessor_structured = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


text_pipeline = Pipeline([
    ('selector', ColumnSelector('DESCRIPTION')),
    ('tfidf', TfidfVectorizer(max_features=500, ngram_range=(1, 2)))
])

In [21]:
full_preprocessor = FeatureUnion([
    ('structured', preprocessor_structured),
    ('text', text_pipeline)
])


model = Pipeline([
    ('features', full_preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


model.fit(X_train, y_train)
y_pred = model.predict(X_test)



y_train_pred = model.predict(X_train)


In [28]:

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Mean Absolute Error (MAE): {mae:,.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")
results = X_test.copy()
results['True_Price'] = y_test
results['Predicted_Price'] = y_pred

print(results[['True_Price', 'Predicted_Price']].head(10))

results.to_csv('predictions_output.csv', index=False)

Mean Absolute Error (MAE): 11,743,705.55
Root Mean Squared Error (RMSE): 49,127,917.52
       True_Price  Predicted_Price
15218  45000000.0     4.394000e+07
10949  23900000.0     2.685850e+07
7781    9500000.0     1.248700e+07
17440   6000000.0     6.464000e+06
3441   48000000.0     3.927000e+07
15978     95000.0     9.197000e+04
12038  75000000.0     7.965875e+07
8765     250000.0     2.993200e+05
13182     82000.0     8.147145e+04
15293     42000.0     7.010000e+04


In [36]:
from sklearn.metrics import r2_score
train_r2 = r2_score(y_train, y_train_pred)
print(f"Train R² Score: {train_r2:.4f}")

test_r2 = r2_score(y_test, y_pred)
print(f"Test R² Score: {test_r2:.4f}")


Train R² Score: 0.9794
Test R² Score: 0.8762


In [38]:
n = len(y_train)
k = X_train.shape[1]
adjusted_r2_train = 1 - (1 - train_r2) * ((n - 1) / (n - k - 1))


print(f"Adjusted R² Score for train: {adjusted_r2_train:.4f}")

n = len(y_test)
k = X_test.shape[1]
adjusted_r2_test = 1 - (1 - test_r2) * ((n - 1) / (n - k - 1))


print(f"Adjusted R² Score for test: {adjusted_r2_test:.4f}")

Adjusted R² Score for train: 0.9794
Adjusted R² Score for test: 0.8759


In [30]:
import joblib

joblib.dump(model, 'house_price_model.pkl')
print("Model saved to house_price_model.pkl")


Model saved to house_price_model.pkl


In [31]:
from google.colab import files

files.download('house_price_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>