In [9]:
# Step 1: Import the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Load the CSV data
df = pd.read_csv("house_prices.csv")
print(df.head())  # Preview the first 5 rows

# Step 3: Define the features and the target
X = df[["Rooms", "Area", "Age", "Location"]]  # Independent variables
y = df["Price"]                               # Dependent variable (target)

# Step 4: The categorical column ("Location") has to be encoded
categorical_cols = ["Location"]
# Not referenced below: these columns reach the model unchanged through
# remainder='passthrough' in the ColumnTransformer.
numeric_cols = ["Rooms", "Area", "Age"]

# Step 5: Build the ColumnTransformer that applies the OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a Location value that shows up only in the
        # rows passed to predict() (possible after a random train/test split,
        # or with new data) is encoded as all zeros instead of raising
        # ValueError, which is what the default handle_unknown="error" does.
        (
            "loc_encoder",
            OneHotEncoder(handle_unknown="ignore"),
            categorical_cols,
        )
    ],
    remainder='passthrough'  # The remaining (numeric) columns are kept as-is
)

# Step 6: Create the pipeline (encoding + model) so that a single
# fit()/predict() call runs the preprocessing and then the regression.
pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("regression_model", LinearRegression())
])

"""
Machine learning models (jaise LinearRegression) sirf numbers pe kaam karte hain — wo text (strings) ko samajh nahi sakte.

✅ Problem:
Agar tum direct 'Location' = Urban dene ki koshish karo, to model bolega:

“Mujhe yeh samajh nahi aata! Urban number to nahi hai!”

1️⃣ ColumnTransformer
Iska kaam hai:

“Specific columns ko specific processing do.”

Jaise: Location column ko encode karo, baaki columns waise hi chhor do.

2️⃣ Pipeline
Ek pipeline tumhara poora workflow combine karta hai:

Step 1: Encode categorical data

Step 2: Train regression model

Tumhe alag alag fit_transform() ya fit() nahi chalana padta.

"""

# Step 7: Hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Steps 8-9: Train the model through the pipeline, then predict on the test
# rows. fit() returns the fitted pipeline, so the two calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Step 10: Evaluate the model's performance on the held-out rows
mse_value = mean_squared_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)
print("Mean Squared Error (MSE):", mse_value)
print("R2 Score :", r2_value)


   Rooms  Area  Age  Location   Price
0      3  1200   10     Urban  200000
1      4  1500    5     Urban  250000
2      2   800   20     Rural  120000
3      5  2000    2     Urban  320000
4      3  1100   15  Suburban  180000
Mean Squared Error (MSE): 80001732.93070479
R2 Score : 0.9506838632618944
