In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [None]:
# Load the raw listings and preview them before any cleaning is applied.
df = pd.read_csv("house_prices.csv")
print("Original Data Sample")
print(df.head())

print("\nMissing values before handling:")
print(df.isnull().sum())

# Numeric gaps are filled with the column's own mean. A column that has no
# missing values simply passes through fillna unchanged.
for numeric_col in ('SquareFeet', 'Bedrooms', 'Bathrooms', 'Age'):
    column_mean = df[numeric_col].mean()
    df[numeric_col] = df[numeric_col].fillna(column_mean)

# The categorical column is filled with its most frequent label instead.
most_common_neighborhood = df['Neighborhood'].mode()[0]
df['Neighborhood'] = df['Neighborhood'].fillna(most_common_neighborhood)

# Swap neighborhood labels for integer codes. The fitted encoder is kept in
# `le` so the identical label -> code mapping can be applied to new rows later.
le = LabelEncoder().fit(df['Neighborhood'])
df['Neighborhood'] = le.transform(df['Neighborhood'])

print("\nFinal Preprocessed Data:")
print(df.head())


Original Data Sample
   SquareFeet  Bedrooms  Bathrooms   Age Neighborhood   Price
0      3974.0       4.0          1  32.0            B  625050
1      1660.0       5.0          1  47.0            C  268217
2      2094.0       1.0          3  11.0            B  381043
3      1930.0       5.0          1  21.0            A  356021
4      1895.0       5.0          2   NaN            B  315302

Missing values before handling:
SquareFeet      1
Bedrooms        1
Bathrooms       0
Age             1
Neighborhood    2
Price           0
dtype: int64

Final Preprocessed Data:
   SquareFeet  Bedrooms  Bathrooms        Age  Neighborhood   Price
0      3974.0       4.0          1  32.000000             1  625050
1      1660.0       5.0          1  47.000000             2  268217
2      2094.0       1.0          3  11.000000             1  381043
3      1930.0       5.0          1  21.000000             0  356021
4      1895.0       5.0          2  27.444444             1  315302


In [None]:
# Rank every column by its correlation with Price to see which features are
# most likely to impact the price (strongest positive first).
correlation = df.corr()
price_correlation = correlation['Price'].sort_values(ascending=False)

print("\nFeature Correlation with Price:")
print(price_correlation)


Feature Correlation with Price:
Price           1.000000
SquareFeet      0.980033
Bathrooms       0.109665
Neighborhood    0.102283
Bedrooms       -0.018407
Age            -0.147243
Name: Price, dtype: float64


In [None]:
# Fit a linear regression on 80% of the rows, then evaluate its performance
# on the held-out 20%. The fixed random_state keeps the split reproducible.
features = ['SquareFeet', 'Bedrooms', 'Bathrooms', 'Age', 'Neighborhood']
X, y = df[features], df['Price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

# Score predictions for the rows the model never saw during fitting.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\nModel Evaluation:")
print(f"R2 Score: {r2:.2f}")
print(f"Mean Squared Error: {mse:.2f}")


Model Evaluation:
R2 Score: 0.98
Mean Squared Error: 484894778.40


In [None]:
# Predict the price of a house given a new set of features.
# Example: 2200 sqft, 3 bedrooms, 2 bathrooms, 5 years old, neighborhood = 'A'

# Reuse the fitted encoder so 'A' gets the same integer code as in training.
neighborhood_code = le.transform(['A'])[0]

# One-row frame whose columns are named and ordered like the training features.
new_data = pd.DataFrame({
    'SquareFeet': [2200],
    'Bedrooms': [3],
    'Bathrooms': [2],
    'Age': [5],
    'Neighborhood': [neighborhood_code],
})

# predict() returns one value per input row; take the only one.
predicted_price = model.predict(new_data)[0]
print(f"\nPredicted Price for New House: ${predicted_price:,.2f}")


Predicted Price for New House: $401,257.00
