In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pickle
import ipywidgets as widgets
from IPython.display import display

In [2]:

# Load the raw real-estate dataset (path is relative to the working directory)
csv_path = 'Expanded_Real_Estate_Data.csv'
data = pd.read_csv(csv_path)

# Print the column names to debug
column_names = data.columns.tolist()
print("Columns in the DataFrame:", column_names)
data.head()

Columns in the DataFrame: ['No', 'X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude', 'X6 longitude', 'Y house price of unit area']


Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [3]:
# Show the (rows, columns) dimensions of the frame as loaded
data.shape

(3000, 8)

In [4]:

# Drop the leading 'No' column (a running record number in the rows shown
# above) by name rather than by position. A positional `data.iloc[:, 1:]`
# strips one more column every time this cell is re-run and would eventually
# remove real features; a by-name drop with errors='ignore' is safe to repeat.
data = data.drop(columns=['No'], errors='ignore')

# Print the first few rows and column names
print(data.head())  # Shows the first 5 rows of the dataset
print("Columns in the dataset:", data.columns)  # Lists all column names


   X1 transaction date  X2 house age  X3 distance to the nearest MRT station  \
0             2012.917          32.0                                84.87882   
1             2012.917          19.5                               306.59470   
2             2013.583          13.3                               561.98450   
3             2013.500          13.3                               561.98450   
4             2012.833           5.0                               390.56840   

   X4 number of convenience stores  X5 latitude  X6 longitude  \
0                               10     24.98298     121.54024   
1                                9     24.98034     121.53951   
2                                5     24.98746     121.54391   
3                                5     24.98746     121.54391   
4                                5     24.97937     121.54245   

   Y house price of unit area  
0                        37.9  
1                        42.2  
2                        47.3  


In [7]:
# Report how many values are missing in every column
missing_counts = data.isnull().sum()
print("Missing values in each column:\n", missing_counts)

# Impute any gaps with the mean of the respective column
column_means = data.mean()
data.fillna(column_means, inplace=True)

Missing values in each column:
 X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64


In [8]:
# Column the model is trained to predict
target = 'Y house price of unit area'

# Predictor columns, in the order they are handed to the scaler and the model
# ('X1 transaction date' is not used as a predictor)
features = [
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]


In [9]:
# Fail fast, listing the available columns, when the target column is absent
target_is_present = target in data.columns
if not target_is_present:
    print(f"Column '{target}' not found in the data. Available columns are: {data.columns}")
    raise KeyError(f"Target column '{target}' does not exist in the DataFrame.")

In [10]:
# Separate predictors from the target, then hold out 20% of the rows for testing
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed keeps the split reproducible
)

In [11]:
# NOTE(review): this cell repeats the mean imputation and the train/test split
# done in earlier cells; with unchanged data and the same random_state it
# produces the same partition again.
column_means = data.mean()
data.fillna(column_means, inplace=True)

# Split the data into training and testing sets
X = data[features]
y = data[target]
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [12]:
# Standardize the features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
#
# This cell overwrites X_train / X_test with scaled arrays, so feeding those
# names back in on a re-run would fit the scaler on already-standardized data
# (mean ~0, std ~1) and every later `scaler.transform` call would be wrong.
# To keep the cell re-runnable, the raw partitions are rebuilt from X using the
# row labels that y_train / y_test still carry from the split (same rows, same
# order as the original X_train / X_test).
scaler = StandardScaler()
X_train = scaler.fit_transform(X.loc[y_train.index])
X_test = scaler.transform(X.loc[y_test.index])

In [13]:
# Random forest hyper-parameters; the fixed seed makes training repeatable
rf_params = {"n_estimators": 300, "max_depth": 10, "random_state": 42}

# Train on the scaled training partition
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and compare against the true prices
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 17.462210852707994
R-squared: 0.9098225877937323


In [17]:
# Model score on the training partition and on the held-out partition
# (val_score is computed on X_test / y_test)
train_score, val_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

In [18]:
# Print both scores side by side to make the train/held-out gap visible
for label, score in (('Train Score: ', train_score), ('Validation Score: ', val_score)):
    print(label, score)

Train Score:  0.9533394218274495
Validation Score:  0.9098225877937323


In [19]:
# Save the model using pickle
with open('real_estate_rf_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# The model was trained on standardized features, so the pickle above is only
# usable together with the fitted scaler: persist that next to it
with open('real_estate_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Load the saved model (round-trip check of the file just written).
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('real_estate_rf_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [20]:
import ipywidgets as widgets
from IPython.display import display
import pickle


# Load the trained model (make sure it's been saved in a previous cell).
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('real_estate_rf_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Widgets for user input
house_age_widget = widgets.FloatText(description="House Age:")
distance_widget = widgets.FloatText(description="Distance to MRT:")
convenience_widget = widgets.IntText(description="Convenience Stores:")
latitude_widget = widgets.FloatText(description="Latitude:")
longitude_widget = widgets.FloatText(description="Longitude:")

# Dedicated output area for the callback: text printed from a button handler
# is not reliably attached to the cell output unless it is captured by an
# Output widget
prediction_output = widgets.Output()

# Display the widgets
display(house_age_widget, distance_widget, convenience_widget, latitude_widget, longitude_widget)


# Function to make predictions based on user input
def predict_price(b):
    """Predict the unit-area price for the values currently in the widgets.

    Reads the five input widgets, scales the values with the `scaler` fitted
    earlier in this session, runs `loaded_model` on them and prints the result
    into `prediction_output`, replacing any previous message.

    Parameters
    ----------
    b : ipywidgets.Button
        The button instance passed by `on_click`; not used.
    """
    # Collect inputs from widgets
    house_age = house_age_widget.value
    distance_to_mrt = distance_widget.value
    num_convenience_stores = convenience_widget.value
    latitude = latitude_widget.value
    longitude = longitude_widget.value

    with prediction_output:
        # Show only the message for the latest click
        prediction_output.clear_output(wait=True)

        # The numeric widgets always hold a number (default 0), so a
        # `value is None` test can never fire; reject impossible values instead
        if min(house_age, distance_to_mrt, num_convenience_stores) < 0:
            print("House age, distance to MRT and convenience stores must not be negative.")
            return

        # Prepare the input data; the column order must match the training features
        input_features = pd.DataFrame(
            [[house_age, distance_to_mrt, num_convenience_stores, latitude, longitude]],
            columns=features,
        )
        # Scale the input features the same way the training data was scaled
        input_features_scaled = scaler.transform(input_features)

        # Predict using the loaded model
        predicted_price = loaded_model.predict(input_features_scaled)[0]
        print(f"Predicted house price per unit area: {predicted_price:.2f}")


# Button to trigger prediction
predict_button = widgets.Button(description="Predict House Price")
predict_button.on_click(predict_price)
display(predict_button, prediction_output)

FloatText(value=0.0, description='House Age:')

FloatText(value=0.0, description='Distance to MRT:')

IntText(value=0, description='Convenience Stores:')

FloatText(value=0.0, description='Latitude:')

FloatText(value=0.0, description='Longitude:')

Button(description='Predict House Price', style=ButtonStyle())

Predicted house price per unit area: 46.10
