In [3]:
# Import the files module from Google Colab
# This allows us to upload files from our local system
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Load the dataset using pandas
# Import pandas for data manipulation and analysis
import pandas as pd

# Read the uploaded CSV file into a pandas DataFrame
house = pd.read_csv("/content/drive/MyDrive/Housing.csv")

# Display the first 5 rows to verify the data loaded correctly
house.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Print the shape of the dataset (rows, columns)
print("Shape of dataset:", house.shape)

# Print column names
print("\nColumn names:")
print(house.columns.tolist())

# Display dataset information (data types, null values)
print("\nDataset Info:")
house.info()

# Display statistical summary of numeric columns
print("\nSummary Statistics:")
house.describe()

Shape of dataset: (545, 13)

Column names:
['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


## **Data Preprocessing**

In [5]:
# Check how many missing (null) values are present in each column
# This helps decide whether data cleaning is needed
house.isnull().sum()

Unnamed: 0,0
price,0
area,0
bedrooms,0
bathrooms,0
stories,0
mainroad,0
guestroom,0
basement,0
hotwaterheating,0
airconditioning,0


In [6]:
# Convert categorical variables into numeric form
# get_dummies creates dummy/one-hot encoded variables
house = pd.get_dummies(house, drop_first=True)

In [7]:
# Check the dataset after preprocessing
house.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,12250000,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,12215000,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,True,True,True,False,True,False,False,False


In [8]:
# Check data types to ensure all are numeric
house.dtypes

Unnamed: 0,0
price,int64
area,int64
bedrooms,int64
bathrooms,int64
stories,int64
parking,int64
mainroad_yes,bool
guestroom_yes,bool
basement_yes,bool
hotwaterheating_yes,bool


Data preprocessing involved handling missing values and converting categorical variables into numeric form using one-hot encoding.This step ensures the dataset is suitable for regression modeling.


### **Feature Selection & Train–Test Split**

In [9]:
# Separate input features (X) and target variable (y)
# X contains all columns except 'price'
# y contains only the 'price' column
X = house.drop('price', axis=1)
y = house['price']


In [10]:
#Split Dataset into Training and Testing Sets
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
# 80% of data is used for training, 20% for testing
# random_state ensures reproducible results
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
#Verify the Split
# Print shapes to verify correct splitting
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (436, 13)
Testing set shape: (109, 13)


The dataset was split into training and testing sets to evaluate the model on unseen data. An 80-20 split was used to ensure sufficient data for training while retaining data for evaluation.


### **Train Linear Regression Model**

In [14]:
# Import Linear Regression model from scikit-learn
from sklearn.linear_model import LinearRegression
# Create a Linear Regression model instance
# Linear Regression is suitable for predicting continuous values like house prices
model = LinearRegression()
# Train the model using training data
# The model learns the relationship between features and house price
model.fit(X_train, y_train)

In [15]:
# Predict house prices for the test dataset
y_pred = model.predict(X_test)

A Linear Regression model was trained using the training dataset to learn the relationship between property features and house prices.The trained model was then used to predict prices on the test dataset.

## **Model Evaluation (MAE & RMSE)**

In [16]:
# Import evaluation metrics for regression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [17]:
# Calculate Mean Absolute Error (MAE)
# MAE measures the average absolute difference between actual and predicted prices
mae = mean_absolute_error(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 970043.4039201636


In [18]:
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Take square root to get RMSE
rmse = mse ** 0.5

print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 1324506.9600914386


The Linear Regression model was evaluated using MAE and RMSE. The obtained error values indicate reasonable predictive performance,with RMSE being higher than MAE due to penalization of larger errors.