In [24]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the housing CSV into a DataFrame and take a first look at it
df = pd.read_csv("../CSV/Housing.csv")

# Heading followed by the describe() table
print("Dataset Summary:", df.describe(), sep="\n")

# Blank line, heading, then the per-column count of null entries
print("\nMissing Values Check:", df.isnull().sum(), sep="\n")

# Keep only the rows that contain no missing values
df = df.dropna()

In [None]:
# Histogram of every numerical column (30 bins each) to inspect the distributions.
# numerical_cols is reused by the boxplot cell that follows.
numerical_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
hist_axes = df[numerical_cols].hist(bins=30, figsize=(10, 8))

# All histogram panels live on one figure; title and save that figure explicitly
hist_fig = hist_axes.flat[0].figure
hist_fig.suptitle('Distribution of Numerical Variables', y=1.02)
hist_fig.savefig('Distribution of Numerical Variables.png', dpi=800, bbox_inches='tight')
plt.show()

In [None]:
# One boxplot per numerical column, laid out on a 2x3 grid, to spot outliers
plt.figure(figsize=(12, 6))
for position, column in enumerate(numerical_cols, start=1):
    # subplot positions are 1-based, hence start=1 above
    panel = plt.subplot(2, 3, position)
    sns.boxplot(y=df[column], ax=panel)
    panel.set_title(f'Boxplot of {column}')

# Tighten the panel spacing first, then place the overall title just above the grid
plt.tight_layout()
plt.suptitle('Outlier Detection with Boxplots', y=1.02)
plt.savefig('Outlier Detection.png', dpi=800, bbox_inches='tight')
plt.show()

In [28]:
# Preprocess binary categorical features (yes/no to 1/0).
#
# Series.map() turns every value that is not a key of the mapping into NaN.
# A bare map() therefore has two silent failure modes:
#   * re-running this cell maps the already-encoded 1/0 values to NaN;
#   * labels such as 'Yes' or ' no' become NaN.
# The loop below is safe to re-run and raises on labels it does not recognise.
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_codes = {'yes': 1, 'no': 0}
for col in binary_cols:
    # Column already holds only 0/1 (cell was run before): leave it untouched.
    if df[col].isin([0, 1]).all():
        continue

    # Normalise case and surrounding whitespace before the lookup.
    encoded = df[col].astype(str).str.strip().str.lower().map(yes_no_codes)

    # Anything still unmapped is a label we do not understand; fail loudly
    # here instead of passing NaN on to the later cells.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, col].astype(str).unique())
        raise ValueError(
            f"Column '{col}' contains values other than yes/no: {unexpected}"
        )

    df[col] = encoded.astype('int64')

In [None]:
# One-Hot Encode non-ordinal categorical feature (furnishingstatus).
# get_dummies() removes the source column, so a second run of this cell would
# raise KeyError; the guard makes the cell safe to re-run on an encoded frame.
if 'furnishingstatus' in df.columns:
    # drop_first=True avoids the dummy variable trap.
    # dtype=int stores the indicators as 0/1 integers, matching the yes/no
    # columns, instead of the default dtype, which differs between pandas
    # versions.
    df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True, dtype=int)

# Normalize numerical features to [0,1] range using Min-Max Scaling.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell, so the test rows contribute to
# the min/max used for scaling. If strict train/test separation is required,
# fit the scaler on the training rows only and transform the test rows with it.
scaler = MinMaxScaler()
scaled_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])  # Apply scaling to specified columns

# Analyze feature correlations with the target variable (price):
# pairwise correlations of all columns, drawn as an annotated heatmap
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix Heatmap')
plt.savefig('Correlation Matrix.png', dpi=800, bbox_inches='tight')
plt.show()

In [30]:
# Separate the target column (price) from the predictor columns
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Build the Linear Regression estimator and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test features
y_pred = model.predict(X_test)

In [32]:
# Score the test-set predictions with RMSE and R²
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # Lower RMSE indicates better accuracy
r2 = r2_score(y_test, y_pred)  # Higher R² (closer to 1) indicates better fit

# Blank line, heading, then RMSE to 2 decimals and R² to 4 decimals
print(
    '\nModel Performance:',
    f'RMSE: {rmse:.2f}',
    f'R² Score: {r2:.4f}',
    sep='\n',
)


Model Performance:
RMSE: 1324506.96
R² Score: 0.6529
