In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error # MAE is the evaluation metric used for every model below, chosen in case of outliers
import matplotlib.pyplot as plt


from IPython.core.interactiveshell import InteractiveShell # Imported so the cell-output behaviour can be changed on the next line
InteractiveShell.ast_node_interactivity = "all" # Display the value of every expression statement in a cell, not only the last one
# Lift pandas' display limits so wide frames are shown in full.
# NOTE(review): with max_rows=None, displaying a full DataFrame prints every row -- stick to .head()/.info().
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn/xgboost -- consider filtering by category.
warnings.filterwarnings('ignore')

In [None]:
# Load the Miami housing dataset; the relative path means the CSV must sit in the working directory
df = pd.read_csv('miami-housing.csv')

 **Data from: https://www.kaggle.com/datasets/deepcontractor/miami-housing-dataset**

The dataset contains the following columns:

    PARCELNO: unique identifier for each property. About 1% appear multiple times.
    SALE_PRC: sale price ($)
    LND_SQFOOT: land area (square feet)
    TOT_LVG_AREA: floor area (square feet)
    SPEC_FEAT_VAL: value of special features (e.g., swimming pools) ($)
    RAIL_DIST: distance to the nearest rail line (an indicator of noise) (feet)
    OCEAN_DIST: distance to the ocean (feet)
    WATER_DIST: distance to the nearest body of water (feet)
    CNTR_DIST: distance to the Miami central business district (feet)
    SUBCNTR_DI: distance to the nearest subcenter (feet)
    HWY_DIST: distance to the nearest highway (an indicator of noise) (feet)
    age: age of the structure
    avno60plus: dummy variable for airplane noise exceeding an acceptable level
    structure_quality: quality of the structure
    month_sold: sale month in 2016 (1 = January)
    LATITUDE
    LONGITUDE


In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# Summarise column dtypes and non-null counts
df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between every column of the dataset (features and target)
corr_matrix = df.corr()

# Draw ONE annotated heatmap. The original cell rendered the same figure twice
# (once from a rounded copy of the matrix, once from the raw matrix); formatting
# the annotations with fmt='.2f' gives the two-decimal labels without needing a
# second, redundant plot.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show();

In [None]:
# Split the data into features (X) and the target variable (y).
# PARCELNO is documented above as a unique identifier for each property, not a
# property attribute. It is excluded from the features: an ID carries no
# generalisable signal, and because about 1% of parcels appear more than once a
# tree model could otherwise key on the ID of a parcel seen in both the training
# and test splits.
X = df.drop(columns=['SALE_PRC', 'PARCELNO'])
y = df['SALE_PRC']

In [None]:
# (rows, columns) of the feature matrix
X.shape

In [None]:
# Length of the target vector; should match the row count of X
y.shape

In [None]:
# Every column name present in the raw dataset
all_columns = df.columns.tolist()

# Names of the model inputs, taken directly from the feature matrix so this list
# can never drift from the columns the models are actually trained on
feature_names = X.columns.tolist()

# List containing only the target column name
target_name = ['SALE_PRC']

In [None]:
# Preview the first rows of the feature matrix
X.head()

In [None]:
# Count the missing entries in each feature column
missing_per_column = X.isna().sum()
missing_per_column

# ▶ Initial XGBoost Model

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.30, random_state=101)
X_train, X_test, y_train, y_test = split_parts

In [None]:
import xgboost as xgb

# Hyperparameters for the baseline XGBoost regressor
initial_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 10,
}
xg_reg = xgb.XGBRegressor(**initial_params)

In [None]:
# Train the baseline regressor on the training split
xg_reg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the baseline model on the held-out test set
preds = xg_reg.predict(X_test)

# Mean absolute error between the true and predicted sale prices
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
print("MAE for initial model: %f" % mae)

# ▶ GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Hyperparameter grid to search exhaustively:
# 2 * 3 * 3 * 3 * 3 * 3 = 486 combinations, each fitted once per CV fold.
param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'gamma': [0, 0.1, 0.2],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [5, 7, 9],
    'n_estimators': [50, 100, 200],
    'alpha': [5, 10, 15]
}

# Template estimator that GridSearchCV clones for each candidate. It has its own
# name so the already-fitted baseline `xg_reg` is not replaced by an unfitted
# regressor (which would break the baseline prediction cell if it were re-run).
xg_reg_base = xgb.XGBRegressor(objective='reg:squarederror')

# Rank candidates by MAE -- the metric every model in this notebook is evaluated
# with -- rather than MSE, so the selected parameters optimise what is reported.
# scikit-learn maximises scores, hence the negated metric name.
grid_search = GridSearchCV(
    estimator=xg_reg_base,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
)

# Fit GridSearchCV to the training data only; the test set stays untouched
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters found: ", grid_search.best_params_)

# best_estimator_ is the winning configuration refit on the training data
# (GridSearchCV's default refit=True); use it to predict the test set
best_preds = grid_search.best_estimator_.predict(X_test)

# Evaluate the best estimator
mae_bestpreds = mean_absolute_error(y_test, best_preds)
print("MAE for best estimator: %f" % (mae_bestpreds))

# ▶ Improved XGBoost Model

In [None]:
# Hyperparameters for the tuned model. Every value lies inside the grid searched
# above -- presumably copied from its reported best parameters; re-check them if
# the grid or the data changes.
improved_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.7,
    learning_rate=0.1,
    max_depth=5,
    alpha=5,
    n_estimators=200,
)
xg_reg_improved = xgb.XGBRegressor(**improved_params)

In [None]:
# Train the tuned model on the training split
xg_reg_improved.fit(X_train, y_train)

# Predict the held-out test set and report the mean absolute error
preds_improved = xg_reg_improved.predict(X_test)
mae_improved = mean_absolute_error(y_true=y_test, y_pred=preds_improved)
print("MAE for improved model: %f" % mae_improved)

# ▶ Prediction vs Actual Values Visualization

In [None]:
# Sanity check: the actual values and both prediction arrays should have the same length
n_actual, n_initial, n_improved = (len(values) for values in (y_test, preds, preds_improved))
print(n_actual, n_initial, n_improved)

In [None]:
# Compare actual and predicted values for the whole test set (no sampling).
#
# y_test is a pandas Series that still carries the shuffled row labels of the
# original frame, and matplotlib uses a Series' index as the x-coordinates. The
# prediction arrays have no index and are drawn at positions 0..n-1, so plotting
# y_test directly places the actual values on different x-positions than their
# predictions. Converting to a NumPy array aligns all three series by test-set
# position.
actual = y_test.to_numpy()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(actual, label='Actual', color='blue', marker='o', linestyle='None', markersize=5)
ax.plot(preds, label='Initial Predictions', color='red', linestyle='--')
ax.plot(preds_improved, label='Improved Predictions', color='green', linestyle='-.')
ax.set_title('Comparison of Actual vs. Predicted Values')
ax.set_xlabel('Data Point Index')
ax.set_ylabel('Target Variable')
ax.legend()
plt.show();
