<a href="https://colab.research.google.com/github/armukilan/ml_examples/blob/main/Boston_house_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the 'vikrishnan/boston-house-prices' dataset through kagglehub and
# keep whatever dataset_download returns in boston_house_prices_path.
# NOTE(review): boston_house_prices_path is not referenced by any later cell
# visible here (the data frame is built from a URL instead) — confirm this
# download is still needed.
import kagglehub
boston_house_prices_path = kagglehub.dataset_download('vikrishnan/boston-house-prices')

print('Data source import complete.')

In [None]:
# Importing the libraries
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Boston housing data straight from its URL and assemble it into a
# single DataFrame with 13 feature columns plus the "SalePrice" target.
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# The separator is a regex, so it is written as a raw string: "\s" inside a
# normal string literal is an invalid escape sequence and triggers a warning
# on recent Python versions.
# skiprows=22 skips the first 22 lines of the file (presumably its text
# preamble — verify against the file if the source ever changes).
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Every record is spread over two consecutive rows of raw_df:
#   even rows -> all columns (the first block of feature values)
#   odd rows  -> columns 0-1 are the last two features, column 2 is the target
x = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

# Column names for the stacked feature matrix, in file order.
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

# Features first, target appended as the last column; later cells rely on
# the target being data.columns[-1].
data = pd.DataFrame(x, columns=feature_names)
data["SalePrice"] = y

# Quick sanity check of what was loaded.
print(data.head())
print(f"\nDataset shape: {data.shape}")
print(f"Feature columns: {list(data.columns[:-1])}")
print(f"Target column: {data.columns[-1]}")

In [None]:
# Text summary of the assembled frame: dimensions, number of features
# (every column except the last), the feature names and a short preview.
summary_lines = [
    "Boston Housing Dataset Information:",
    "=" * 40,
    f"Dataset shape: {data.shape}",
    f"Features: {data.shape[1] - 1}",
    "Target: SalePrice",
    "\nFeature Names:",
    str(list(data.columns[:-1])),
    "\nFirst few rows:",
    str(data.head()),
]
print("\n".join(summary_lines))

In [None]:
# Print the frame's column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

## Exploratory Data Analysis (EDA)

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Grid of pairwise plots across all columns of `data`, each panel 2.5 inches
# high; tight_layout() then adjusts spacing on the current figure.
# NOTE(review): the grid grows quadratically with the number of columns, so
# this cell can be slow to render.
sns.pairplot(data, height=2.5)
plt.tight_layout()

In [None]:
# Distribution plot of the target column.
sns.displot(data['SalePrice'])

In [None]:
# Shape statistics of the target: asymmetry (skew) and tail weight (kurtosis).
sale_price = data['SalePrice']
print("Skewness: %f" % sale_price.skew())
print("Kurtosis: %f" % sale_price.kurt())

In [None]:
# Scatter plot of the CRIM feature against the target.
fig, ax = plt.subplots()
ax.scatter(data['CRIM'], data['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('CRIM', fontsize=13)
plt.show()

In [None]:
# Scatter plot of the AGE feature against the target.
fig, ax = plt.subplots()
ax.scatter(x = data['AGE'], y = data['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=13)
# The x-axis shows AGE; the label previously read 'CRIM' (copy-paste slip
# from the preceding CRIM plot).
ax.set_xlabel('AGE', fontsize=13)
plt.show()

In [None]:
from scipy import stats
from scipy.stats import norm, skew

# Compare the target's distribution with a fitted normal distribution, then
# draw a probability (Q-Q) plot of the same column.

# Maximum-likelihood estimates of the normal's mean and standard deviation.
(mu, sigma) = norm.fit(data['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# sns.distplot is deprecated, so the same picture is built explicitly:
# a density-scaled histogram with a KDE curve, plus the fitted normal pdf.
fig, ax = plt.subplots()
sns.histplot(data['SalePrice'], stat='density', kde=True, ax=ax)

# Evaluate the fitted pdf over the range the histogram already spans and
# restore the limits afterwards so the extra line does not rescale the axis.
x_min, x_max = ax.get_xlim()
price_grid = np.linspace(x_min, x_max, 200)
ax.plot(
    price_grid,
    norm.pdf(price_grid, mu, sigma),
    color='black',
    # Raw string: "\m" and "\s" are invalid escapes in a normal literal.
    # The label is attached to this line so the legend points at the normal
    # curve rather than at whichever artist happens to be drawn first.
    label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
)
ax.set_xlim(x_min, x_max)
ax.legend(loc='best')
ax.set_ylabel('Frequency')
ax.set_title('SalePrice distribution')

# Separate figure for the probability plot against the normal distribution.
fig = plt.figure()
res = stats.probplot(data['SalePrice'], plot=plt)
plt.show()

## Data Correlation

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap
# on a 10x10 inch figure.
correlation = data.corr()
heatmap_ax = plt.figure(figsize=(10, 10)).add_subplot()
sns.heatmap(
    correlation,
    annot=True,
    linewidths=0.5,
    cmap=plt.cm.PuBu,
    ax=heatmap_ax,
)
plt.show()

## Model Building

In [None]:
from sklearn.model_selection import train_test_split

# Target vector: the "SalePrice" column. Feature matrix: every other column.
y = data["SalePrice"]
X = data.drop(columns="SalePrice")

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the dimensions of each partition, in the order
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict the held-out rows; predictions[i] belongs to the i-th row of X_test.
predictions = lr.predict(X_test)

# y_test is a pandas Series that keeps the shuffled row labels of `data`, so
# y_test[0] is a lookup by *label* 0: it raises KeyError when that row is not
# in the test split and otherwise returns a row unrelated to predictions[0].
# .iloc[0] selects the first test row by position, which is the one that
# predictions[0] was computed for.
print("Actual value of the house:- ", y_test.iloc[0])
print("Model Predicted Value:- ", predictions[0])

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the test-set predictions: the square root of the
# mean squared difference between y_test and predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = np.sqrt(mse)
print(rmse)