

```
# This is formatted as code
```

# Tutorial 1, Question 10


In [None]:
# Step 1: Import Libraries
import pandas as pd
from urllib.request import urlretrieve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Step 2: Load the Dataset
# Download the CSV from Google Drive into the working directory, then parse
# the local copy into a DataFrame. The download runs unconditionally, so the
# script needs network access on every execution.
url = 'https://drive.google.com/uc?id=1i_MkO3HHP5vVypi-ogEJ7elg5LpfiOpx'
urlretrieve(url, filename='melb_data.csv')
data = pd.read_csv('melb_data.csv')

# Quick sanity summary of what was loaded, followed by a blank separator line.
print(
    "Dataset loaded successfully!",
    f"Shape of data: {data.shape}",
    f"Columns: {data.columns.tolist()}",
    "",
    sep="\n",
)

# Step 3: Basic Data Cleaning
# Columns used as model inputs; 'Price' is the regression target.
# NOTE(review): the 'Lattitude' / 'Longtitude' spellings presumably mirror the
# CSV headers — confirm before "correcting" them.
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'Type']

# Drop rows missing the target (Price) OR any selected feature. Checking only
# the target is not enough: LinearRegression raises on NaN inputs, so a single
# missing feature value would abort training later in the script. When the
# features are already complete this removes nothing extra.
data = data.dropna(subset=['Price'] + features)

# Split the cleaned frame into the feature matrix and the target vector; both
# come from the same filtered rows, so they stay aligned.
X = data[features]
y = data['Price']

# Step 4: Handle Categorical Feature (Type)
# One-hot encode the categorical 'Type' column; all other selected columns are
# passed through unchanged (remainder='passthrough').
# sparse_threshold=0 forces a dense array. Without it ColumnTransformer may
# return a sparse matrix when the stacked output is mostly zeros, and the
# MinMaxScaler used in Step 6 does not accept sparse input.
# NOTE(review): the encoder is fitted on all rows, i.e. before the train/test
# split in Step 5. It only learns the set of 'Type' categories, but a stricter
# setup would fit it on the training rows alone.
column_trans = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Type'])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Replace the feature DataFrame with its encoded array form.
X = column_trans.fit_transform(X)

# Step 5: Split into Training and Test Sets
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Feature Scaling (Min-Max)
# Learn the scaling parameters from the training rows only, then apply that
# same mapping to both splits so the test rows never influence the scaling.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 7 & 8: Build and Train Both Models
# `fit` returns the estimator itself, so each model is constructed and trained
# in a single expression.

# Baseline: ordinary least-squares linear regression.
model_lr = LinearRegression().fit(X_train, y_train)

# Random forest with 100 trees; the fixed seed keeps results repeatable.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 9: Evaluate Using Cross-Validation (5-fold)
# The 'neg_root_mean_squared_error' scorer reports RMSE with its sign flipped,
# so negate the per-fold scores to get ordinary (positive) RMSE values.
cv_lr = np.negative(
    cross_val_score(model_lr, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
)
cv_rf = np.negative(
    cross_val_score(model_rf, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
)

# Report the average RMSE across the five folds for each model.
print("Cross-Validation RMSE (Linear Regression):", cv_lr.mean())
print("Cross-Validation RMSE (Random Forest):", cv_rf.mean())
print()

# Step 10: Test Set Evaluation
# Score each trained model on the held-out test rows. RMSE is computed as the
# square root of the mean squared error.

# Linear Regression
y_pred_lr = model_lr.predict(X_test)
rmse_lr = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_lr))

# Random Forest
y_pred_rf = model_rf.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_rf))

print("Test RMSE (Linear Regression):", rmse_lr)
print("Test RMSE (Random Forest):", rmse_rf)
print()

# Step 11: Model Comparison
# The model with the strictly lower test RMSE is reported as the better one;
# on an exact tie the Linear Regression message is printed.
print(
    "✅ Random Forest performs better than Linear Regression."
    if rmse_rf < rmse_lr
    else "✅ Linear Regression performs better than Random Forest."
)