Importing the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [None]:
# Fetch the California housing dataset through scikit-learn. The returned
# object's .data, .feature_names and .target attributes are read further down.
house_price_dataset = sklearn.datasets.fetch_california_housing()

In [None]:
# Show the dataset's bundled human-readable description instead of dumping the
# whole container (raw arrays included), which is hard to read in cell output.
print(house_price_dataset.DESCR)

In [None]:
# Wrap the feature matrix in a DataFrame, one named column per feature
house_price_dataframe = pd.DataFrame(
    data=house_price_dataset.data,
    columns=house_price_dataset.feature_names,
)
house_price_dataframe.head(n=5)

In [None]:
# Append the regression target to the frame as the 'MedVal' column
house_price_dataframe['MedVal'] = house_price_dataset['target']

In [None]:
# Preview the first five rows of the frame
house_price_dataframe.head(n=5)

In [None]:
# (number of rows, number of columns) of the frame
house_price_dataframe.shape

In [None]:
# Count the missing entries in each column
house_price_dataframe.isna().sum()

In [None]:
# Per-column summary statistics of the frame
house_price_dataframe.describe()

Understanding the correlation between various features in the dataset

In [None]:
# Pairwise Pearson correlation between every pair of columns
correlation = house_price_dataframe.corr(method='pearson')

In [None]:
# Draw the correlation matrix as an annotated heat map.
plt.figure(figsize=(10, 10))
sns.heatmap(
    correlation,
    cbar=True,
    square=True,
    fmt='.1f',          # one decimal place in each annotated cell
    annot=True,
    annot_kws={'size': 8},
    cmap='Blues',
)
# A title lets the figure stand alone; plt.show() renders it explicitly and
# keeps the Axes repr out of the cell output.
plt.title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Split the data into features (x) and target (y)
x = house_price_dataframe.drop(columns='MedVal')  # every column except the target
y = house_price_dataframe.loc[:, 'MedVal']        # the target column on its own

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

Model Training and Evaluation

XGBoost Regressor

In [None]:
# Create an XGBoost regressor with its default settings (no arguments passed)
# and fit it on the training split
model = XGBRegressor()
model.fit(x_train, y_train)

In [None]:
# Evaluate the fitted model on the training data
training_data_prediction = model.predict(x_train)

# R-squared (coefficient of determination) between actual and predicted values
score_1 = metrics.r2_score(y_train, training_data_prediction)

# Mean absolute error between actual and predicted values
score_2 = metrics.mean_absolute_error(y_train, training_data_prediction)

# Labels name the metrics actually computed above: R-squared is a score rather
# than an error, and score_2 is the mean *absolute* error, not the squared one.
print('R squared score: ', score_1)
print('Mean absolute error: ', score_2)

In [None]:
# Visualizing the actual prices against the predicted prices (training data)
axes = plt.gca()
axes.scatter(y_train, training_data_prediction)
axes.set(
    xlabel='Actual Prices',
    ylabel='Predicted Prices',
    title='Actual Price vs Predicted Price',
)
plt.show()

In [None]:
# Evaluate the fitted model on the held-out test data
test_data_prediction = model.predict(x_test)

# R-squared (coefficient of determination) between actual and predicted values
score_1 = metrics.r2_score(y_test, test_data_prediction)

# Mean absolute error between actual and predicted values
score_2 = metrics.mean_absolute_error(y_test, test_data_prediction)

# Labels name the metrics actually computed above: R-squared is a score rather
# than an error, and score_2 is the mean *absolute* error, not the squared one.
print('R squared score: ', score_1)
print('Mean absolute error: ', score_2)