In [2]:

!pip install pandas numpy scikit-learn matplotlib seaborn jupyter




In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

# Fetch the California housing data; as_frame=True exposes a single DataFrame
# (`data.frame`) that holds the feature columns together with 'MedHouseVal'.
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True)
df = data.frame

# Optional: uncomment to eyeball the raw per-column distributions
# df.hist(bins=30, figsize=(12, 8))
# plt.tight_layout()
# plt.show()

# ----------------------------
# STEP 1: OUTLIER REMOVAL (IQR)
# ----------------------------
# Quartiles of the target, computed in a single call and unpacked in order.
Q1, Q3 = df['MedHouseVal'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond each quartile.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose target lies within the fences (both ends inclusive).
df = df[df['MedHouseVal'].between(lower_bound, upper_bound)]

# ----------------------------
# STEP 2: TRAIN/TEST SPLIT
# ----------------------------
# Split the raw (unscaled) rows FIRST so that the scaler fitted below never
# sees the held-out test rows. Fitting the scaler on the full dataset before
# splitting leaks the test set's min/max into the preprocessing step.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('MedHouseVal', axis=1),
    df['MedHouseVal'],
    test_size=0.2,
    random_state=42,
)

# ----------------------------
# STEP 3: SCALING INPUT FEATURES
# ----------------------------
# Columns that get min-max scaled; the target column and Latitude/Longitude
# are not in this list and are therefore left in their original units.
features_to_scale = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

# Learn the per-feature min/max from the training rows only.
scaler = MinMaxScaler()
scaler.fit(X_train[features_to_scale])

# Work on explicit copies so the column assignments below do not write into
# frames derived from `df`.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[features_to_scale] = scaler.transform(X_train[features_to_scale])
# Test values outside the training min/max may map slightly outside [0, 1].
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# Full scaled frame plus feature/target views, kept under their original
# names and produced with the same train-fitted scaler.
df_scaled = df.copy()
df_scaled[features_to_scale] = scaler.transform(df[features_to_scale])
X = df_scaled.drop('MedHouseVal', axis=1)
y = df_scaled['MedHouseVal']

# ----------------------------
# STEP 4: TRAIN THE MODEL
# ----------------------------
# fit() returns the estimator itself, so construction and fitting can chain.
model = LinearRegression().fit(X_train, y_train)

# ----------------------------
# STEP 5: EVALUATE
# ----------------------------
# Score the held-out rows with both metrics from one set of predictions.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"📊 Mean Squared Error: {mse:.4f}")
print(f"📈 R² Score: {r2:.4f}")

# ----------------------------
# STEP 6: PREDICT CUSTOM INPUT
# ----------------------------
# One hand-written sample, given in raw (unscaled) feature units.
sample_dict = {
    'MedInc': 3.5, 'HouseAge': 25, 'AveRooms': 5.2, 'AveBedrms': 1.1,
    'Population': 1200, 'AveOccup': 2.7, 'Latitude': 34.2, 'Longitude': -118.5,
}

# Single-row frame with its columns arranged in the training column order.
sample = pd.DataFrame([sample_dict]).loc[:, X_train.columns]

# Apply the already-fitted scaler to the same columns that were scaled for
# training; the remaining columns pass through unchanged.
sample_scaled = sample.copy()
sample_scaled[features_to_scale] = scaler.transform(sample_scaled[features_to_scale])

# predict() returns one value for the single row; unpack that lone element.
(predicted_value,) = model.predict(sample_scaled)
# The prediction is multiplied by 100000 to display it as a dollar amount.
print(f"🏡 Predicted median house value: ${predicted_value * 100000:.2f}")


📊 Mean Squared Error: 0.3688
📈 R² Score: 0.6013
🏡 Predicted median house value: $196400.18
