In [None]:
# 📚 Basic libraries
import pandas as pd
import numpy as np 

# 📊 Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# ⚙️ Settings
pd.options.display.max_columns = None  # show every column of wide frames
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. solver convergence) — confirm that is intended.
warnings.simplefilter('ignore')

# 🤖 Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error

# Model Regression
from sklearn.linear_model import Lasso

<h2 style="color: #00aaff;">Data Extraction</h2>

In [None]:
# Load the raw CSV once and keep it untouched in `data`;
# all further work happens on the independent copy `df`.
data = pd.read_csv(filepath_or_buffer='../dataset/king_country_houses_aa.csv')
df = data.copy(deep=True)

# Preview the first ten rows
df.head(n=10)

In [None]:
# Remove the identifier and date columns before the numeric analysis.
num_df = df.drop(['id', 'date'], axis='columns')
num_df

In [None]:
# Treat a column as continuous when it has more than 20 distinct values.
is_continuous = num_df.nunique().gt(20)
continuous_df = num_df.loc[:, is_continuous]
continuous_df

In [None]:
# Drop the following columns: yr_built, yr_renovated, zipcode, bathrooms.
# This cell reassigns `continuous_df` from itself, and which columns survive the
# `nunique() > 20` filter depends on the data. errors='ignore' skips any listed
# column that is already absent, so the cell neither fails on a re-run nor when
# the filter has already excluded one of these columns.
continuous_df = continuous_df.drop(
    columns=['yr_built', 'yr_renovated', 'zipcode', 'bathrooms'],
    errors='ignore',
)

<h2 style="color: #00aaff;">Dealing with multicollinearity</h2>

In [None]:
# Put the target column (price) at the far right so it is easy to locate
# in the correlation matrix below.
target = continuous_df["price"]
continuous_df = continuous_df.drop(columns="price")
continuous_df["price"] = target
continuous_df.head(5)

In [None]:
# Pairwise correlation of the remaining columns, rounded to two decimals.
num_corr = continuous_df.corr().round(2)

In [None]:
# Correlation Matrix-Heatmap Plot
# Apply the seaborn theme / font scale BEFORE the figure is created: it works by
# updating matplotlib rcParams, and setting it after `plt.subplots` made the first
# run of this cell render differently from every later run.
sns.set(font_scale=1.5) # increase font size

# Boolean mask that hides the upper triangle (diagonal included), i.e. the
# mirrored half of the symmetric correlation matrix.
mask = np.zeros_like(num_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(20, 10))

ax = sns.heatmap(
    num_corr,
    mask=mask,
    annot=True,
    annot_kws={"size": 12},
    linewidths=.5,
    cmap="coolwarm",
    fmt=".2f",  # show coefficients with 2 decimal places
    ax=ax,
)
ax.set_title("Dealing with Multicollinearity", fontsize=20) # add title
plt.show()

- `sqft_lot` is highly correlated with `sqft_lot15`, so we drop `sqft_lot` before fitting the regression
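- `zipcode` has a moderate negative relationship with `long` (note: `zipcode` was dropped from `continuous_df` earlier, so it does not appear in the heatmap above)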

In [None]:
# Modelling frame: everything in continuous_df except sqft_lot.
ml_df = continuous_df.drop('sqft_lot', axis='columns')
ml_df

<h1 style="color: #00aaff;">01 | Modeling</h1>

### X-y Split

In [None]:
# Separate the target (price) from the feature matrix
y = ml_df["price"]
X = ml_df.drop(columns=["price"])

### Train-test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Report the row counts of the full frame and of each split, one per line.
print(
    f'100% of our data: {len(ml_df)}.',
    f'70% for training data: {len(X_train)}.',
    f'30% for test data: {len(X_test)}.',
    sep='\n',
)

<h2 style="color: #00aaff;">Selecting the model: Lasso Regression</h2>

In [None]:
# L1-regularised linear regression; alpha=1.0 is the library default, spelled out here.
model = Lasso(alpha=1.0)

In [None]:
# Fit the model on the training split only.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test features.
predictions_lasso = model.predict(X=X_test)

<h2 style="color: #00aaff;">Model Validation</h2>

In [None]:
# Score the Lasso predictions against the held-out targets.
MAE_3_3 = mean_absolute_error(y_true=y_test, y_pred=predictions_lasso)
MSE_3_3 = mean_squared_error(y_true=y_test, y_pred=predictions_lasso)
RMSE_3_3 = root_mean_squared_error(y_true=y_test, y_pred=predictions_lasso)
r2_3_3 = r2_score(y_true=y_test, y_pred=predictions_lasso)

In [None]:
# Collect the four evaluation metrics into a two-column table (name, value).
metric_rows = [
    ('R2', r2_3_3),
    ('RMSE', RMSE_3_3),
    ('MSE', MSE_3_3),
    ('MAE', MAE_3_3),
]
lasso_metrics_df = pd.DataFrame(metric_rows, columns=['Metrics', 'Values'])

# Show floats with four fixed decimals instead of scientific notation.
# This is a global pandas option, so it also affects frames displayed later.
pd.options.display.float_format = '{:.4f}'.format
lasso_metrics_df

<h2 style="color: #00aaff;">Reporting</h2>

In [None]:
# Actual vs. predicted values side by side, plus the absolute error per row.
eval_df = pd.DataFrame({"actual": y_test, "pred": predictions_lasso})
eval_df["dif"] = (eval_df["actual"] - eval_df["pred"]).abs()
# Replace the index carried over from y_test with a fresh 0..n-1 range.
eval_df = eval_df.reset_index(drop=True)
eval_df.head()

In [None]:
# Scatter of actual vs. predicted values with a fitted regression line.
scatter_color = "#FF6347"
line_color = "#FF8C00"

# Use an explicit Figure/Axes pair and hand the Axes to seaborn, rather than
# relying on pyplot's implicit "current axes".
fig_eval, ax_eval = plt.subplots(figsize=(10, 6))

sns.regplot(
    x='actual',
    y='pred',
    data=eval_df,
    scatter_kws={"color": scatter_color, "alpha": 0.7},
    line_kws={"color": line_color, "linewidth": 3},
    ax=ax_eval,
)

ax_eval.set_ylim(bottom=0)  # start the y-axis at 0, so negative predictions are cut off
ax_eval.set_title('Actual vs. Predicted Values', fontsize=16)
ax_eval.set_xlabel('Actual', fontsize=14)
ax_eval.set_ylabel('Predictions', fontsize=14)
# Tick label size for both axes
ax_eval.tick_params(axis='both', labelsize=12)
plt.show()