In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Housing.csv")

In [3]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,mainroad,guestroom,basement,airconditioning,parking,furnishingstatus
0,13300000,7420,4,2,yes,no,no,yes,2,furnished
1,12250000,8960,4,4,yes,no,no,yes,3,furnished
2,12250000,9960,3,2,yes,no,yes,no,2,semi-furnished
3,12215000,7500,4,2,yes,no,yes,yes,3,furnished
4,11410000,7420,4,1,yes,yes,yes,yes,2,furnished


In [4]:
df['mainroad'].value_counts() # Tells how many houses are on a main road vs not.

mainroad
yes    468
no      77
Name: count, dtype: int64

In [5]:
df['furnishingstatus'].value_counts() # Shows how many houses are furnished, semi-furnished, or unfurnished.

furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64

# one hot encoding
    it is used to convert the categorial value into the numerical value

In [6]:
pd.get_dummies(df, columns=['mainroad', 'guestroom', 'basement', 'airconditioning', 'furnishingstatus'],dummy_na=False,sparse=False)

Unnamed: 0,price,area,bedrooms,bathrooms,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,basement_yes,airconditioning_no,airconditioning_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,2,False,True,True,False,True,False,False,True,True,False,False
1,12250000,8960,4,4,3,False,True,True,False,True,False,False,True,True,False,False
2,12250000,9960,3,2,2,False,True,True,False,False,True,True,False,False,True,False
3,12215000,7500,4,2,3,False,True,True,False,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,False,True,False,True,False,True,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,2,False,True,True,False,False,True,True,False,False,False,True
541,1767150,2400,3,1,0,True,False,True,False,True,False,True,False,False,True,False
542,1750000,3620,2,1,0,False,True,True,False,True,False,True,False,False,False,True
543,1750000,2910,3,1,0,True,False,True,False,True,False,True,False,True,False,False


In [7]:
# Multicollinearity in regression occurs when independent variables are highly correlated with each other, 
# making it difficult to isolate the individual effect of each on the dependent variable. 
# This can lead to unreliable regression results and inaccurate interpretations. 

In [8]:
x = df.drop("price", axis=1)
y = df['price']

In [9]:
from sklearn.model_selection import train_test_split
x_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [10]:
x_train

Unnamed: 0,area,bedrooms,bathrooms,mainroad,guestroom,basement,airconditioning,parking,furnishingstatus
46,6000,3,2,yes,no,no,yes,1,furnished
93,7200,3,2,yes,no,yes,yes,3,semi-furnished
335,3816,2,1,yes,no,yes,yes,2,furnished
412,2610,3,1,yes,no,yes,no,0,unfurnished
471,3750,3,1,yes,no,no,no,0,unfurnished
...,...,...,...,...,...,...,...,...,...
71,6000,4,2,yes,no,no,yes,0,unfurnished
106,5450,4,2,yes,no,yes,yes,0,semi-furnished
270,4500,3,2,yes,no,no,no,1,furnished
435,4040,2,1,yes,no,no,no,0,unfurnished


In [11]:
X_test

Unnamed: 0,area,bedrooms,bathrooms,mainroad,guestroom,basement,airconditioning,parking,furnishingstatus
316,5900,4,2,no,no,yes,no,1,unfurnished
77,6500,3,2,yes,no,no,yes,0,furnished
360,4040,2,1,yes,no,no,no,0,semi-furnished
90,5000,3,1,yes,no,no,yes,0,semi-furnished
493,3960,3,1,yes,no,no,no,0,furnished
...,...,...,...,...,...,...,...,...,...
15,6000,4,1,yes,no,yes,no,2,semi-furnished
357,6930,4,1,no,no,no,no,1,furnished
39,6000,4,2,yes,no,no,yes,1,semi-furnished
54,6000,3,2,yes,yes,no,yes,1,semi-furnished


In [12]:
y_train

46     7525000
93     6300000
335    3920000
412    3430000
471    3010000
        ...   
71     6755000
106    6160000
270    4340000
435    3290000
102    6195000
Name: price, Length: 436, dtype: int64

In [13]:
y_test

316    4060000
77     6650000
360    3710000
90     6440000
493    2800000
        ...   
15     9100000
357    3773000
39     7910000
54     7350000
155    5530000
Name: price, Length: 109, dtype: int64

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
one = OneHotEncoder(sparse_output=False, drop= 'first')

In [16]:
x_train_new = one.fit_transform(x_train[['mainroad', 'guestroom', 'basement', 'airconditioning', 'parking', 'furnishingstatus']])
x_train_new

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 1., 0., ..., 0., 1., 0.]], shape=(436, 9))

In [17]:
# useless line. does nothing. written by mistake in simple words.
# but doesn't effect the programe cause we later build an pipeline to replace this manual step
# but this step was intended to combine the new encoded data with the numeric old data. so,
# i'll just write the correct code in the next column
new_df = np.hstack(x_train[['area','bedrooms', 'bathrooms', 'parking']])
new_df

array(['area', 'bedrooms', 'bathrooms', 'parking'], dtype='<U9')

In [18]:
x_train.dtypes

area                 int64
bedrooms             int64
bathrooms            int64
mainroad            object
guestroom           object
basement            object
airconditioning     object
parking              int64
furnishingstatus    object
dtype: object

In [19]:
# This step manually combines numerical + encoded features (usually replaced by ColumnTransformer in a pipeline)
# np.hstack([...]) merges them column-wise into a final 2D array
new_df = np.hstack([
    x_train[['area', 'bedrooms', 'bathrooms', 'parking']].values, # x_train[['...']] selects the numerical columns|
    #.values converts it into a NumPy array (2D)
    x_train_new # x_train_new is the one-hot encoded NumPy array
])
new_df

array([[6.000e+03, 3.000e+00, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [7.200e+03, 3.000e+00, 2.000e+00, ..., 1.000e+00, 1.000e+00,
        0.000e+00],
       [3.816e+03, 2.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [4.500e+03, 3.000e+00, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.040e+03, 2.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [5.500e+03, 3.000e+00, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]], shape=(436, 13))

In [20]:
new_df.dtype

dtype('float64')

In [21]:
from sklearn.compose import ColumnTransformer

In [22]:
# Identify categorical columns
cate_cols = x.select_dtypes(include='object').columns.tolist()

In [23]:
cate_cols

['mainroad', 'guestroom', 'basement', 'airconditioning', 'furnishingstatus']

In [24]:
# Define preprocessing step
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder(drop='first'),cate_cols)
    ],
    remainder='passthrough',
    sparse_threshold=0.3
)
# remainder='passthrough'  # keeps the numeric columns as-is
# sparse_threshold=0.3 allows conversion to dense array if <30% non-zeros

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [26]:
# Build pipeline
# Pipeline combines preprocessing + model training in one object
# 'preprocessor' does encoding + keeps numeric columns
# 'regression' step trains LinearRegression model on transformed features

pl = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regression', LinearRegression())
])

In [27]:
# so this line here was absolutely useless. cause this has been done before and this just does that again
# ⚠️ Note: This re-splits the original unencoded data again. 
# It's unnecessary since we already did train_test_split before.
# This line is safe only because the pipeline handles encoding.

from sklearn.model_selection import train_test_split
x_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [28]:
# Fit pipeline
pl.fit(x_train, y_train)

''' 
yeah! your absolutely right! we just passed the un-encoded data.
but we passed it inside the pipeline.so the pipe line has the processor.
the processor has our enoding i.e. one hot encoding and also all the code.
hence, the encoding and all was to be done after the .fit method passed.
'''

'''
Yes! We're passing unencoded data into the pipeline,
but that's totally fine — because the pipeline internally applies all preprocessing.

Step-by-step:
- 'preprocessor' applies OneHotEncoding to categorical columns
- numeric columns are passed through unchanged
- then LinearRegression is trained on the fully transformed data

So manual encoding was unnecessary — the pipeline takes care of everything!
'''


"\nYes! We're passing unencoded data into the pipeline,\nbut that's totally fine — because the pipeline internally applies all preprocessing.\n\nStep-by-step:\n- 'preprocessor' applies OneHotEncoding to categorical columns\n- numeric columns are passed through unchanged\n- then LinearRegression is trained on the fully transformed data\n\nSo manual encoding was unnecessary — the pipeline takes care of everything!\n"