In [1]:
# Import required libraries for data manipulation and machine learning
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
from sklearn import linear_model  # For implementing linear regression
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets

In [2]:
# Read the Boston Housing data from a CSV file in the working directory
# into a pandas DataFrame named `df`.
# NOTE(review): later cells expect a 'price' column holding the target
# (described upstream as median home value in $1000s) — confirm against the file.
df = pd.read_csv("boston_housing.csv")

In [None]:
# Inspect the dimensions of the DataFrame.
# df.shape is a (number of rows, number of columns) tuple; as the last
# expression in the cell it is shown as the cell output.
# NOTE(review): expected to be 506 rows x 14 columns (13 features plus the
# target column) — confirm against the loaded CSV.
df.shape

In [None]:
# Show summary statistics for the numeric columns of the dataset:
# - count: number of non-null values in each column
# - mean: average value of each column
# - std: standard deviation (spread of the values)
# - min / max: smallest and largest value in each column
# - 25% / 50% / 75%: quartiles (50% is the median)
# The result is displayed because it is the last expression in the cell.
df.describe()

In [None]:
# Display the first 5 rows (the default for head()) to get a feel for the
# columns and the kind of values they hold before modelling.
df.head()

In [None]:
# Build the feature matrix df_x: every column of df except the target 'price'.
# The target must not appear among the features; otherwise the model simply
# learns to copy 'price' (data leakage) and the evaluation is meaningless.
# df.drop returns a new DataFrame, so df itself is left untouched.
df_x = df.drop('price', axis=1)
print(df_x)

In [None]:
# Target vector: the 'price' column of df, i.e. the dependent variable
# the regression is asked to predict.
df_y = df["price"]
print(df_y)

In [8]:
# Linear regression models the target as a linear combination of the input
# features plus an intercept, choosing the weights that minimise the sum of
# squared differences between predictions and observed values.
# Here we only create the (still unfitted) scikit-learn estimator with its
# default settings; the coefficients are learned later when fit() is called.
reg = linear_model.LinearRegression()

In [9]:
# Hold out part of the data so the model can be evaluated on rows it never saw:
# - test_size=0.33 -> 33% of the rows go to (x_test, y_test),
#   the remaining 67% to (x_train, y_train)
# - random_state=42 fixes the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Step 1: train the model on the training split.
# fit() estimates one coefficient per feature column plus an intercept by
# ordinary least squares, i.e. by minimising the sum of squared errors
# between the model's output and y_train.
# Only the training data is used here; the test split stays unseen.
reg.fit(x_train, y_train)

In [None]:
# Print the learned coefficients (weights), one per feature column,
# in the same order as the columns of x_train.
# Each coefficient is the change in predicted price for a one-unit increase
# in that feature with all other features held fixed.
# NOTE: the raw magnitudes are not a measure of feature importance unless the
# features are on comparable scales, because each weight depends on the units
# of its feature.
print(reg.coef_)

In [None]:
# Predict prices for the held-out test rows and print them.
# - reg.predict applies the fitted coefficients and intercept to x_test
# - y_pred holds one predicted price per row of x_test, in the same row order
# - these values are compared with y_test below to judge the model
y_pred = reg.predict(x_test)
print(y_pred)

In [None]:
# Display the predicted price for the first row of the test set.
# y_pred is indexed by position, so [0] is the prediction for the first row
# of x_test; the value is shown as the cell output (nothing is print()-ed).
y_pred[0]

In [None]:
# Display the actual price of the first row of the test set, to compare with
# the prediction y_pred[0].
# y_test is a pandas Series that keeps the original row labels from df, and
# train_test_split shuffles the rows, so the first test row is generally NOT
# labelled 0. Plain y_test[0] looks up the label 0 (a KeyError if that row
# landed in the training split, or an unrelated row otherwise); .iloc[0]
# selects by position and therefore lines up with y_pred[0].
y_test.iloc[0]

In [None]:
# Evaluate the model with the Mean Squared Error (MSE): the average of the
# squared differences between predicted and actual prices.
# - squaring makes every error positive and weights large errors more heavily
# - the lower the MSE, the closer the predictions are to the true prices
print(np.mean(np.square(y_pred - y_test)))