In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor



# Data Exploration

In [2]:
# Location of the listings CSV inside the Kaggle input mount.
DATA_PATH = '/kaggle/input/amsterdam-house-price-prediction/HousingPrices-Amsterdam-August-2021.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Summarize the frame: column names, dtypes and non-null counts per column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924 entries, 0 to 923
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  924 non-null    int64  
 1   Address     924 non-null    object 
 2   Zip         924 non-null    object 
 3   Price       920 non-null    float64
 4   Area        924 non-null    int64  
 5   Room        924 non-null    int64  
 6   Lon         924 non-null    float64
 7   Lat         924 non-null    float64
dtypes: float64(3), int64(3), object(2)
memory usage: 57.9+ KB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0.1,Unnamed: 0,Price,Area,Room,Lon,Lat
count,924.0,920.0,924.0,924.0,924.0,924.0
mean,462.5,622065.4,95.952381,3.571429,4.888605,52.363326
std,266.880123,538994.2,57.447436,1.592332,0.05314,0.024028
min,1.0,175000.0,21.0,1.0,4.644819,52.291519
25%,231.75,350000.0,60.75,3.0,4.855834,52.352077
50%,462.5,467000.0,83.0,3.0,4.886818,52.364631
75%,693.25,700000.0,113.0,4.0,4.922337,52.377598
max,924.0,5950000.0,623.0,14.0,5.029122,52.423805


# Data Preprocessing

In [5]:
# Keep only the numeric columns, then discard every row that still has a
# missing value in one of them.
numeric_data = data.select_dtypes(include=['number']).dropna()

# Predictor columns and the regression target.
feature_columns = ['Area', 'Room', 'Lon', 'Lat']
X = numeric_data[feature_columns]
y = numeric_data['Price']

In [6]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Data Modeling

In [7]:
# Candidate regressors as (display name, unfitted estimator) pairs.
# The tree-based estimators are randomized, so they are seeded to make the
# cross-validation comparison reproducible from run to run.
# NOTE: the features are passed unscaled, which affects distance-based and
# regularized models (K-Nearest Neighbors, Lasso, Ridge).
models = [
    ("Linear Regression", LinearRegression()),
    ("Lasso Regression", Lasso(alpha=0.1)),
    ("Ridge Regression", Ridge(alpha=0.1)),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ("K-Nearest Neighbors", KNeighborsRegressor(n_neighbors=2)),
]

In [8]:
# One empty accumulator per metric; each gets one entry per evaluated model.
mae_scores, mse_scores, rmse_scores, r2_scores = [], [], [], []

***|> Cross-Validation***

In [9]:
# Perform cross-validation for each model, using the training split only.
# The splitter does not depend on the model, so it is built once.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# All three metrics are computed from the same fitted folds in a single
# cross_validate call, so they describe the same fits and each model is
# trained once per fold instead of three times.
scoring = ('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2')

# Start from empty lists so that re-running this cell does not append a
# second set of scores (which would no longer line up with `models`).
mae_scores, mse_scores, rmse_scores, r2_scores = [], [], [], []

for name, model in models:
    cv_results = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring)
    # scikit-learn reports error metrics negated ("higher is better"); flip the sign back.
    mae = -cv_results['test_neg_mean_absolute_error'].mean()
    mse = -cv_results['test_neg_mean_squared_error'].mean()
    # RMSE is taken as the square root of the fold-averaged MSE.
    rmse = np.sqrt(mse)
    r2 = cv_results['test_r2'].mean()
    mae_scores.append(mae)
    mse_scores.append(mse)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

In [10]:
# Assemble the comparison table: one row per model, one column per metric.
model_names = [label for label, _estimator in models]
metric_columns = {
    "MAE": mae_scores,
    "MSE": mse_scores,
    "RMSE": rmse_scores,
    "R2": r2_scores,
}
results_df = pd.DataFrame({"Model": model_names, **metric_columns})

In [11]:
# Select the row with the highest mean cross-validated R2.
# idxmax() compares against the overall maximum; the previous hand-written
# loop compared each score with its next neighbour only, so it could report a
# model that was not the best one. It also shadowed the builtin `max`.
best_idx = results_df['R2'].idxmax()

# The value labelled "accuracy" in the message is the R2 score.
print(f"Best Model : {results_df.loc[best_idx, 'Model']} with accuracy : {results_df.loc[best_idx, 'R2']}")

Best Model : Gradient Boosting with accuracy : 0.7180530449478173


***|> Training***

In [12]:
# Fit the final estimator on the whole training split. The estimator type is
# hard-coded to gradient boosting rather than read from the comparison table.
# random_state is fixed so the fit, and every score computed from it, is
# reproducible between runs.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

***|> Evaluation***

In [15]:
# Compute R2 on the training split. This is an in-sample score, so it is
# expected to be optimistic compared with the test score.
y_train_pred = model.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)

In [16]:
# Predict prices for the held-out test split with the fitted model.
predicted_price = model.predict(X_test)

In [17]:
# Calculate testing scores on the held-out split.
test_mae = mean_absolute_error(y_test, predicted_price)
test_mse = mean_squared_error(y_test, predicted_price)
# RMSE is derived from the MSE directly: the `squared=False` flag of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6,
# while the square root works on every version.
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, predicted_price)

In [18]:
# Print the in-sample and held-out scores side by side. The labels say
# "accuracy", but both values are R2 scores.
print(f' \t SUMMARY \n Training accuracy : {train_r2} \n Testing accuracy : {test_r2}')

 	 SUMMARY 
 Training accuracy : 0.9500283665531262 
 Testing accuracy : 0.804322489427493
