In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn import preprocessing
import warnings
# Globally silences every warning for the rest of the session, including
# deprecation notices raised by the libraries imported above.
# NOTE(review): consider narrowing this to specific categories so real problems stay visible.
warnings.filterwarnings('ignore')

In [None]:
# Load the housing dataset from the CSV file in the working directory
df = pd.read_csv('housing.csv')

In [None]:
# Sanity check: dimensions, missing values per column, and duplicated rows
missing_per_column = df.isnull().sum()
duplicated_row_count = df.duplicated().sum()
print("The Shape:", df.shape)
print("The NULL values:\n", missing_per_column)
print("Number of duplicated values", duplicated_row_count)

In [None]:
# Show every row that has at least one missing value
has_missing_value = df.isnull().any(axis=1)
df[has_missing_value]

In [None]:
# Impute missing total_bedrooms values with the column mean (rounded to 2 decimals).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the attribute-accessed Series: that chained
# in-place form is deprecated in pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
total_bedrooms_mean = df['total_bedrooms'].mean().round(2)
df['total_bedrooms'] = df['total_bedrooms'].fillna(total_bedrooms_mean)

In [None]:
# Column names of the dataset as a plain Python list
list(df.columns)

In [None]:
# Descriptive statistics of df, displayed as the cell output
df.describe()

In [None]:
# Print a concise summary of df (columns, non-null counts, dtypes)
df.info()

In [None]:
# Show the first 5 rows
df.head(n=5)

In [None]:
# Show the last 5 rows
df.tail(n=5)

In [None]:
# Distribution of median income.
# The style is set before the axes are created so the whitegrid theme applies to them.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(8,10), facecolor='azure')
sns.histplot(df['median_income'], color='skyblue', ax=ax)
ax.set_title("Median Income (scaled to $10000)", fontsize=18)
ax.set_xlabel("Median")
# Label fixed: the original read "Frequence"
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the target column, used to spot outliers
fig, ax = plt.subplots(figsize=(12,8), facecolor='azure')
sns.boxplot(x=df['median_house_value'], color='lightblue', ax=ax)
ax.set_title("Boxplot of Median House Prices", fontsize=18)
ax.set_xlabel("Median House Prices")
fig.tight_layout()
plt.show()

In [None]:
# Keep only the object-dtype (categorical) columns
df_categorical = df.select_dtypes(include='object')

In [None]:
# Label-encode each categorical column, then replace the original columns in df
# with their encoded versions (the encoded columns end up at the right of df).
# NOTE(review): integer codes impose an arbitrary order on nominal categories —
# confirm this is acceptable for the linear model.
enc = preprocessing.LabelEncoder()
df_categorical = df_categorical.apply(lambda column: enc.fit_transform(column))
df_without_categorical = df.drop(columns=df_categorical.columns)
df = pd.concat([df_without_categorical, df_categorical], axis=1)

In [None]:
# Separate the target column from the feature matrix
target_column = 'median_house_value'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# 1. Linear Regression
# Build the estimator, then fit it on the training split
linear = LinearRegression()
linear.fit(X=X_train, y=y_train)

In [None]:
# Linear Regression predictions for the held-out test features
linear_pred = linear.predict(X=X_test)

In [None]:
# Report test-set error metrics for Linear Regression, rounded to 3 decimals.
# Header fixed: the original printed "Matrices" instead of "Metrics".
print("Linear Regression Metrics:")
linear_metric_functions = {
    'MAE': metrics.mean_absolute_error,
    'MSE': metrics.mean_squared_error,
    'RMSE': metrics.root_mean_squared_error,
    'R^2': metrics.r2_score,
}
for metric_label, metric_function in linear_metric_functions.items():
    print(f'{metric_label}:', round(metric_function(y_test, linear_pred), 3))

In [None]:
# Compare the Linear Regression predictions against the actual values.
# reset_index(drop=True) discards y_test's original row labels so both columns
# share a simple 0..n-1 index.
test = pd.DataFrame({'Predicted': linear_pred, 'Actual': y_test}).reset_index(drop=True)

# Line plot of the first 50 test samples
fig, ax = plt.subplots(figsize=(16,8), facecolor='azure')
ax.plot(test[:50])
ax.legend(['Predicted','Actual'])
ax.set_title('Linear Regression: Predicted vs Actual (first 50 test samples)')
ax.set_xlabel('Test sample')
ax.set_ylabel('Median house value')
plt.show()

# Show the overall relationship between predicted and actual values.
# jointplot is a figure-level function that builds its own figure, so a
# preceding plt.figure(...) only produces an empty extra figure; size and
# background are applied to the jointplot's own figure instead.
joint_grid = sns.jointplot(x='Predicted', y='Actual', data=test, kind='reg', height=8)
joint_grid.figure.set_facecolor('azure')
joint_grid.figure.tight_layout()
plt.show()

In [None]:
# 2. Random Forest Regressor
# Seeded estimator, fitted on the training split
rand = RandomForestRegressor(random_state=42)
rand.fit(X=X_train, y=y_train)

In [None]:
# Random Forest predictions for the held-out test features
rand_pred = rand.predict(X=X_test)

In [None]:
# Report test-set error metrics for the Random Forest Regressor, rounded to 3 decimals.
# Header fixed: the original printed "Matrices" instead of "Metrics".
print("Random Forest Regressor Metrics:")
rand_metric_functions = {
    'MAE': metrics.mean_absolute_error,
    'MSE': metrics.mean_squared_error,
    'RMSE': metrics.root_mean_squared_error,
    'R^2': metrics.r2_score,
}
for metric_label, metric_function in rand_metric_functions.items():
    print(f'{metric_label}:', round(metric_function(y_test, rand_pred), 3))

In [None]:
# Compare the Random Forest predictions against the actual values.
# reset_index(drop=True) discards y_test's original row labels so both columns
# share a simple 0..n-1 index.
test = pd.DataFrame({'Predicted': rand_pred, 'Actual': y_test}).reset_index(drop=True)

# Line plot of the first 50 test samples
fig, ax = plt.subplots(figsize=(16,8), facecolor='azure')
ax.plot(test[:50])
ax.legend(['Predicted','Actual'])
ax.set_title('Random Forest: Predicted vs Actual (first 50 test samples)')
ax.set_xlabel('Test sample')
ax.set_ylabel('Median house value')
plt.show()

# Show the overall relationship between predicted and actual values.
# jointplot is a figure-level function that builds its own figure, so a
# preceding plt.figure(...) only produces an empty extra figure; size and
# background are applied to the jointplot's own figure instead.
joint_grid = sns.jointplot(x='Predicted', y='Actual', data=test, kind='reg', height=8)
joint_grid.figure.set_facecolor('azure')
joint_grid.figure.tight_layout()
plt.show()

In [None]:
# Heatmap of pairwise correlations between the columns of df (rounded to 2 decimals)
correlation_matrix = df.corr().round(2)
fig, ax = plt.subplots(figsize=(12,8), facecolor='azure')
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm', cbar=False, ax=ax)
ax.set_title('Correlation Heatmap Between Numeric Variables', fontsize=18)
fig.tight_layout()
plt.show()