# Train-Validation-Test

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Import Data

In [14]:


np.random.seed(42)

n_samples = 1000

# Numeric features
area = np.random.normal(loc=1200, scale=300, size=n_samples)      # sq ft
rooms = np.random.randint(1, 6, size=n_samples)
age = np.random.randint(0, 50, size=n_samples)



# Base price logic (hidden truth)
price = (
    area * 300 +
    rooms * 50000 -
    age * 1000 +
    np.where(location == 'urban', 200000, 0) +
   
    np.random.normal(0, 50000, size=n_samples)  # noise
)

# Create DataFrame
df = pd.DataFrame({
    'area': area,
    'rooms': rooms,
    'age': age,
    
    'price': price
})

df.head()


Unnamed: 0,area,rooms,age,price
0,1349.014246,4,42,846278.41833
1,1158.52071,1,18,374178.256302
2,1394.306561,3,4,529539.785422
3,1656.908957,5,10,735014.213231
4,1129.753988,3,11,504780.737928


## TASK 1 — Create Train / Validation / Test Split

In [15]:
# Creaete X y
X = df.drop('price', axis=1)
y= df['price']

In [16]:
# Step 1 : split into trainval and test

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,y, test_size=0.15, random_state=42
)

In [17]:
# Step 2 : Split train into train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.176, random_state=42
)

## TASK 2 — Train a Baseline Model

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X_train, y_train)

In [19]:
train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
val_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))

In [20]:
print('Train RMSE : ', train_rmse)
print('Validation RMSE : ', val_rmse)

Train RMSE :  106854.10549596205
Validation RMSE :  108854.76433238637


## TASK 3 — Single Split Instability Check

In [23]:
rmse_scores=[]

for seed in [0, 1, 2, 3, 4]:
    X_tr, X_v, y_tr, y_v = train_test_split( X_train_val, y_train_val,
                                             test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_tr, y_tr)
    rmse = np.sqrt(mean_squared_error(y_v, model.predict(X_v)))
    rmse_scores.append(rmse)

In [26]:
print("Validation RMSE's : ", rmse_scores)

Validation RMSE's :  [109072.40523536224, 109072.40523536224, 109072.40523536224, 109072.40523536224, 109072.40523536224]


## TASK 4 — k-Fold Cross-Validation (TRAIN ONLY)

In [32]:
from sklearn.model_selection import cross_val_score

cv_scores  = cross_val_score(
    LinearRegression(),
    X_train_val,
    y_train_val,
    scoring = 'neg_root_mean_squared_error',
    cv=5
)
cv_rmse = -cv_scores

In [33]:
print('CV RMSE per fold :', cv_rmse)
print("CV Mean RMSE : ", cv_rmse.mean())
print("Cv Std RMSE", cv_rmse.std())

CV RMSE per fold : [104738.83791708 107630.22479393 111564.97490557 106590.0923437
 108816.28092038]
CV Mean RMSE :  107868.08217613147
Cv Std RMSE 2282.459221560743


## TASK 5 — FINAL Test Evaluation (ONCE)

In [34]:
final_model = LinearRegression()
final_model.fit(X_train_val, y_train_val)

test_rmse = np.sqrt(mean_squared_error(y_test, final_model.predict(X_test)))

In [35]:
print("Final Test RMSE : ", test_rmse)

Final Test RMSE :  108876.27198278366


### 1️⃣ Split raw data → Train + Test
      (Test is locked away)

### 2️⃣ On TRAIN ONLY:
      → Cross-Validation OR Validation split
      → Model selection
      → Hyperparameter tuning

### 3️⃣ Freeze everything:
      → model
      → hyperparameters
      → preprocessing

### 4️⃣ Evaluate ONCE on Test set
      → Final generalization estimate
