In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
import pickle, json

matplotlib.rcParams["figure.figsize"] = (15,8)




In [45]:
df = pd.read_csv("Housing.csv")
print("Dataset Loaded Successfully ")


Dataset Loaded Successfully 


In [15]:
df = pd.read_csv("Housing.csv")
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [22]:
# ---------------------- BASIC INFO ----------------------
print("\nColumns in dataset:\n", df.columns)
print("\nMissing values:\n", df.isnull().sum())


Columns in dataset:
 Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

Missing values:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [16]:
df.shape

(545, 13)

In [23]:
df = df.replace({'yes': 1, 'no': 0}).infer_objects(copy=False)

  df = df.replace({'yes': 1, 'no': 0}).infer_objects(copy=False)


In [24]:
df = pd.get_dummies(df, drop_first=True)
print("\nData after encoding categorical variables:\n", df.head())


Data after encoding categorical variables:
       price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus_semi-furnished  furnishingstatus_unfurnished  
0                            False            

In [25]:
# ---------------------- FEATURE SELECTION ----------------------
X = df.drop(['price'], axis='columns')
y = df['price']

print("\nFeature matrix shape:", X.shape)
print("Target shape:", y.shape)


Feature matrix shape: (545, 13)
Target shape: (545,)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nData split done ")


Data split done 


In [27]:
# ---------------------- BASELINE MODEL (Linear Regression) ----------------------
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
print("\nLinear Regression R2 Score:", lr_clf.score(X_test, y_test))


Linear Regression R2 Score: 0.6529242642153185


In [28]:
# ---------------------- CROSS VALIDATION ----------------------
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cv_scores = cross_val_score(LinearRegression(), X, y, cv=cv)
print("\nCross Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))


Cross Validation Scores: [0.66112143 0.63270309 0.71526907 0.57752042 0.71074636]
Mean CV Score: 0.6594720734474557


In [42]:

# ---------------------- GRID SEARCH FOR BEST MODEL ----------------------
def find_best_model_using_gridsearchcv(X, y):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [0.1, 1, 2, 5],
                'selection': ['cyclic', 'random']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['friedman_mse', 'squared_error'],
                'splitter': ['best', 'random'],
                'max_depth': [None, 5, 10]
            }
        }
    }
    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=5, scoring='r2', return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

model_results = find_best_model_using_gridsearchcv(X, y)
print("\nBest Model Results:\n", model_results)


Best Model Results:
                model  best_score  \
0  linear_regression   -9.670540   
1              lasso   -9.911801   
2      decision_tree  -17.391768   

                                         best_params  
0                           {'fit_intercept': False}  
1                {'alpha': 5, 'selection': 'cyclic'}  
2  {'criterion': 'friedman_mse', 'max_depth': 5, ...  


In [31]:
final_model = LinearRegression()
final_model.fit(X, y)

In [32]:
# ---------------------- SAVE MODEL ----------------------
with open('housing_price_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)

columns = {'data_columns': [col.lower() for col in X.columns]}
with open('columns.json', 'w') as f:
    f.write(json.dumps(columns))

print("\nModel and Columns JSON saved successfully")


Model and Columns JSON saved successfully


In [38]:
# ---------------------- PREDICTION FUNCTION ----------------------
def predict_price(area, bedrooms, bathrooms, stories, parking, mainroad, guestroom,
                  basement, hotwaterheating, airconditioning, prefarea, furnishingstatus):

    # Prepare input row same as training columns
    x = np.zeros(len(X.columns))

    # Continuous features
    x[X.columns.get_loc('area')] = area
    x[X.columns.get_loc('bedrooms')] = bedrooms
    x[X.columns.get_loc('bathrooms')] = bathrooms
    x[X.columns.get_loc('stories')] = stories
    x[X.columns.get_loc('parking')] = parking

    # Binary (yes/no) encoded features
    if 'mainroad_yes' in X.columns:
        x[X.columns.get_loc('mainroad_yes')] = 1 if mainroad == 'yes' else 0
    if 'guestroom_yes' in X.columns:
        x[X.columns.get_loc('guestroom_yes')] = 1 if guestroom == 'yes' else 0
    if 'basement_yes' in X.columns:
        x[X.columns.get_loc('basement_yes')] = 1 if basement == 'yes' else 0
    if 'hotwaterheating_yes' in X.columns:
        x[X.columns.get_loc('hotwaterheating_yes')] = 1 if hotwaterheating == 'yes' else 0
    if 'airconditioning_yes' in X.columns:
        x[X.columns.get_loc('airconditioning_yes')] = 1 if airconditioning == 'yes' else 0
    if 'prefarea_yes' in X.columns:
        x[X.columns.get_loc('prefarea_yes')] = 1 if prefarea == 'yes' else 0

    # Furnishing status (multi-category one-hot)
    if furnishingstatus.lower() == 'semi-furnished' and 'furnishingstatus_semi-furnished' in X.columns:
        x[X.columns.get_loc('furnishingstatus_semi-furnished')] = 1
    elif furnishingstatus.lower() == 'unfurnished' and 'furnishingstatus_unfurnished' in X.columns:
        x[X.columns.get_loc('furnishingstatus_unfurnished')] = 1


    x_df = pd.DataFrame([x], columns=X.columns)
    return final_model.predict(x_df)[0]

In [46]:
# ---------------------- TEST PREDICTIONS ----------------------
print("\nSample Predictions:")
print("House 1 Price:", predict_price(7500, 4, 2, 2, 3, 'yes', 'no', 'no', 'no', 'yes', 'yes', 'furnished'))
print("House 2 Price:", predict_price(8960, 4, 4, 4, 3, 'yes', 'no', 'no', 'no', 'yes', 'no', 'furnished'))



Sample Predictions:
House 1 Price: 6041320.856011595
House 2 Price: 9274796.58025849
