## Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [29]:
data = {
    "Area": [1200, 1500, np.nan, 1800, 1000, 2000, np.nan],
    "Bedrooms": [2, 3, 2, 4, np.nan, 3, 1],
    "City": ["Mumbai", "Delhi", "Mumbai", "Bangalore", "Delhi", "Mumbai", "Bangalore"],
    "Education": ["Bachelor", "Master", "Bachelor", "PhD", "Master", "Bachelor", np.nan],
    "Age_of_House": [10, 5, 15, 2, 20, 7, 12],
    "Price": [5000000, 7000000, 4800000, 9000000, 4500000, 8000000, 6000000]
}

In [30]:
df = pd.DataFrame(data)

In [31]:
df

Unnamed: 0,Area,Bedrooms,City,Education,Age_of_House,Price
0,1200.0,2.0,Mumbai,Bachelor,10,5000000
1,1500.0,3.0,Delhi,Master,5,7000000
2,,2.0,Mumbai,Bachelor,15,4800000
3,1800.0,4.0,Bangalore,PhD,2,9000000
4,1000.0,,Delhi,Master,20,4500000
5,2000.0,3.0,Mumbai,Bachelor,7,8000000
6,,1.0,Bangalore,,12,6000000


## PART A — Python & Pandas
### Task A1 — Data Understanding
    Print shape
    Print column names
    Check data types
    Identify missing values per column

In [32]:
df.info(), df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Area          5 non-null      float64
 1   Bedrooms      6 non-null      float64
 2   City          7 non-null      object 
 3   Education     6 non-null      object 
 4   Age_of_House  7 non-null      int64  
 5   Price         7 non-null      int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 464.0+ bytes


(None, (7, 6))

In [33]:
df.columns

Index(['Area', 'Bedrooms', 'City', 'Education', 'Age_of_House', 'Price'], dtype='object')

### Task A2 — Handle Missing Values
    Fill numerical columns using median
    Fill categorical columns using mode

In [34]:
df.isna().sum()

Area            2
Bedrooms        1
City            0
Education       1
Age_of_House    0
Price           0
dtype: int64

In [37]:
df['Area'].fillna(df["Area"].median(), inplace =True)
df['Bedrooms'].fillna(df['Bedrooms'].median(), inplace = True)
df['Education'].fillna(df['Education'].mode()[0], inplace=True)

### Task A3 — Verify Cleaning
    Ensure no missing values remain
    Print summary statistics (describe())

In [39]:
df.isna().sum()

Area            0
Bedrooms        0
City            0
Education       0
Age_of_House    0
Price           0
dtype: int64

In [40]:
df.describe()

Unnamed: 0,Area,Bedrooms,Age_of_House,Price
count,7.0,7.0,7.0,7.0
mean,1500.0,2.5,10.142857,6328571.0
std,336.650165,0.957427,6.148945,1728886.0
min,1000.0,1.0,2.0,4500000.0
25%,1350.0,2.0,6.0,4900000.0
50%,1500.0,2.5,10.0,6000000.0
75%,1650.0,3.0,13.5,7500000.0
max,2000.0,4.0,20.0,9000000.0


## PART B — Feature Engineering & Encoding
### Task B1 — Separate Features & Target
    Create X and y
    Ensure Price is NOT in X

In [41]:
X = df.drop("Price", axis =1).values
y = df["Price"].values
X.shape, y.shape

((7, 5), (7,))

### Task B2 — Encode Categorical Variables
Apply:

    OneHotEncoding to City
    Ordinal encoding to Education
    (Bachelor < Master < PhD)

In [42]:
ed_enc = {
    'Bachelor' :1,
    'Master' : 2,
    "PhD" : 3
}

In [45]:
df["Education"] = df["Education"].map(ed_enc)
df

Unnamed: 0,Area,Bedrooms,City,Education,Age_of_House,Price
0,1200.0,2.0,Mumbai,1,10,5000000
1,1500.0,3.0,Delhi,2,5,7000000
2,1500.0,2.0,Mumbai,1,15,4800000
3,1800.0,4.0,Bangalore,3,2,9000000
4,1000.0,2.5,Delhi,2,20,4500000
5,2000.0,3.0,Mumbai,1,7,8000000
6,1500.0,1.0,Bangalore,1,12,6000000


In [49]:
df_encoded = pd.get_dummies(df, columns=["City"])
df_encoded

Unnamed: 0,Area,Bedrooms,Education,Age_of_House,Price,City_Bangalore,City_Delhi,City_Mumbai
0,1200.0,2.0,1,10,5000000,0,0,1
1,1500.0,3.0,2,5,7000000,0,1,0
2,1500.0,2.0,1,15,4800000,0,0,1
3,1800.0,4.0,3,2,9000000,1,0,0
4,1000.0,2.5,2,20,4500000,0,1,0
5,2000.0,3.0,1,7,8000000,0,0,1
6,1500.0,1.0,1,12,6000000,1,0,0


## PART C — Train/Test Split & Leakage Control
### Task C1 — Split Data
    80% train, 20% test
    Use random_state=42

In [50]:
X = df.drop("Price", axis =1)
y = df["Price"]

In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [53]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5, 5), (2, 5), (5,), (2,))

### Task C2 — Scaling (CRITICAL)
    Identify numerical columns
    Apply StandardScaler
    Fit ONLY on training data
    Transform both train & test

In [57]:
from sklearn.preprocessing import StandardScaler

In [61]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

In [62]:
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols]= scaler.transform(X_test[num_cols])

## PART D — Final Verification
### Task D1 — Shape Checks
* Print shapes of:

       X_train
        X_test
        y_train
        y_test

In [65]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5, 5)
(2, 5)
(5,)
(2,)


### Task D2 — Concept Check (Write Answers)

Answer in 1–2 lines each:

Why didn’t we scale before splitting?
-> Splitting the data after scaling would lead to data leakage. Therefore we should scale after splitting. Train and test should be scaled individually.
-->> We don’t scale before splitting to avoid data leakage, because scaling learns statistics from the data. We split first, fit the scaler only on the training set, and apply it to the test set.

Why didn’t we encode after converting to NumPy?
-->>We perform encoding in Pandas rather than NumPy because Pandas preserves column names and data types, which is essential for safe categorical encoding

Which models would NOT need scaling here?
-> DecisionTree, RandomForest such models do not require Scaling
-->>Tree-based models like Decision Trees and Random Forests do not require scaling since they are not distance- or gradient-based.