<a href="https://colab.research.google.com/github/amanjaiswalofficial/machine-learning-engineer-projects/blob/main/HOML/04_Scikit_Learn_Estimators_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data; with as_frame=True the returned object
# exposes the data as a single pandas DataFrame under `.frame`.
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Preview the first five rows
df.head(n=5)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Column dtypes and non-null counts (info() prints its report directly)
df.info()

# Summary statistics.
# A notebook cell only renders its *last* expression automatically, so this
# intermediate result must be displayed explicitly or it is silently dropped.
# `display` is provided by the IPython/Jupyter kernel.
display(df.describe())

# Missing values per column (rendered as the cell's result)
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal,0


In [None]:
# Feature groups: every column other than the target is treated as numeric
numerical_features = df.columns.drop('MedHouseVal')
categorical_features = []  # this dataset has no categorical columns

# Numeric preprocessing: median imputation, then standardization
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route the numeric columns through the numeric pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_features),
])

In [None]:
# Split data: hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X = df.drop(columns=['MedHouseVal'])
y = df['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models.
# Pipeline does not copy the steps it is given, so passing the same
# `preprocessor` object to both pipelines would make them share (and refit)
# one transformer. Each pipeline therefore gets its own unfitted clone.
models = {
    'Linear Regression': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', LinearRegression())
    ]),
    'Random Forest': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
}

# Train each pipeline on the training split and report test-set metrics
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n {name} Performance:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
    print(f"R² Score: {r2_score(y_test, y_pred):.4f}")



 Linear Regression Performance:
MAE: 0.5332
RMSE: 0.7456
R² Score: 0.5758

 Random Forest Performance:
MAE: 0.3274
RMSE: 0.5051
R² Score: 0.8053


### Hyperparameter tuning

In [None]:
# Search space; the `model__` prefix addresses the pipeline step named 'model'
param_grid = dict(
    model__n_estimators=[50, 100],
    model__max_depth=[10, 20],
)

# 3-fold cross-validated grid search over the Random Forest pipeline
grid_search = GridSearchCV(
    estimator=models['Random Forest'],
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination
print(f"Best Parameters: {grid_search.best_params_}")

# Score the refitted best pipeline on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

tuned_mae = mean_absolute_error(y_test, y_pred)
tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
tuned_r2 = r2_score(y_test, y_pred)

print("\n📌 Tuned Random Forest Performance:")
print(f"MAE: {tuned_mae:.4f}")
print(f"RMSE: {tuned_rmse:.4f}")
print(f"R² Score: {tuned_r2:.4f}")


Best Parameters: {'model__max_depth': 20, 'model__n_estimators': 100}

📌 Tuned Random Forest Performance:
MAE: 0.3278
RMSE: 0.5057
R² Score: 0.8049


In [None]:
# Stand-in for "new" data: the first five rows of the test set
sample = X_test.head(5)
predictions = best_model.predict(sample)

# Blank line, heading, then the predicted values on their own line
print("\nPredictions:", predictions, sep="\n")



Predictions:
[0.51593347 0.73852561 4.9364372  2.50093201 2.26701923]


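### What are estimators?
An estimator is any object that learns from data through a `.fit()` method. Estimators that can also make predictions (predictors) additionally implement `.predict()`, while estimators that modify data (transformers) implement `.transform()`. Estimators include: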

1. Preprocessing Estimators (e.g., StandardScaler, SimpleImputer)

2. Model Estimators (e.g., LinearRegression, RandomForestRegressor)

---

Estimators used in this notebook:

1. LinearRegression(): This Estimator learns the relationship between features and the target.
2. RandomForestRegressor(n_estimators=100): This Estimator builds multiple decision trees.
3. SimpleImputer(strategy='median'): This Estimator fills missing values with the median.
4. StandardScaler(): This Estimator scales numerical features.
5. GridSearchCV(estimator, param_grid, cv=3): This Estimator optimizes hyperparameters.

## Transformer

A Transformer in Scikit-Learn is an estimator that modifies input data: it learns any parameters it needs in `fit()` and applies them to the data in `transform()` (`fit_transform()` does both in one call).



In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Reload the dataset so this cell can run on its own
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Feature groups
numerical_features = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms",
    "Population", "AveOccup", "Latitude", "Longitude",
]
categorical_features = []  # this dataset has no categorical columns

# Numeric transformer chain: median imputation, then standardization
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Apply the numeric chain to the numeric columns
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, numerical_features),
])

# Features (X) and target (y)
X = df.drop(columns=["MedHouseVal"])
y = df["MedHouseVal"]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# End-to-end pipeline: preprocessing followed by a linear regression model
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])

# Fit on the training split and predict the test split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Show the first five predictions
print("\nPredictions (First 5):", y_pred[:5], sep="\n")



Predictions (First 5):
[0.71912284 1.76401657 2.70965883 2.83892593 2.60465725]


### Preprocessing + Multiple Models + Hyperparameter Tuning with Transformers

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


def regression_metrics(y_true, y_pred):
    """Return the tuple (MAE, RMSE, R²) comparing predictions to true values."""
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )


# Load dataset
housing = fetch_california_housing(as_frame=True)

# Add a synthetic categorical feature for demonstration.
# `.assign` builds a new frame instead of writing a column into `housing.frame`,
# so the loaded dataset object itself is left unmodified.
df = housing.frame.assign(
    House_Age_Category=lambda frame: pd.cut(
        frame["HouseAge"], bins=[0, 20, 50, np.inf], labels=["New", "Mid", "Old"]
    )
)

# Define feature types
numerical_features = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"]
categorical_features = ["House_Age_Category"]

# Preprocessing pipelines
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),  # Fill missing values
    ("scaler", StandardScaler())  # Standardize features
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  # Fill missing values
    ("encoder", OneHotEncoder(handle_unknown="ignore"))  # Convert categorical to numerical
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, numerical_features),
    ("cat", cat_pipeline, categorical_features)
])

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Split data
X = df.drop(columns=["MedHouseVal"])  # Features
y = df["MedHouseVal"]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models and evaluate
for name, model in models.items():
    # Pipeline does not copy its steps; cloning gives every pipeline its own
    # preprocessor rather than refitting one object shared by all of them.
    pipeline = Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae, rmse, r2 = regression_metrics(y_test, y_pred)

    print(f"\n📊 Model: {name}")
    print(f"🔹 MAE: {mae:.4f}")
    print(f"🔹 RMSE: {rmse:.4f}")
    print(f"🔹 R² Score: {r2:.4f}")

# Hyperparameter tuning for Random Forest.
# The `model__` prefix addresses the pipeline step named "model".
param_grid = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth": [10, 20, None]
}

# GridSearchCV clones the estimator it is given for every fit, so the
# template preprocessor can be passed here directly.
grid_search = GridSearchCV(Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
]), param_grid, cv=3, scoring="neg_mean_squared_error", n_jobs=-1)

grid_search.fit(X_train, y_train)

# Best Random Forest Model
best_rf = grid_search.best_estimator_
best_y_pred = best_rf.predict(X_test)

best_mae, best_rmse, best_r2 = regression_metrics(y_test, best_y_pred)

print("\n🏆 Best Random Forest Model (After Hyperparameter Tuning)")
print(f"🔹 Best Parameters: {grid_search.best_params_}")
print(f"🔹 MAE: {best_mae:.4f}")
print(f"🔹 RMSE: {best_rmse:.4f}")
print(f"🔹 R² Score: {best_r2:.4f}")



📊 Model: Linear Regression
🔹 MAE: 0.5294
🔹 RMSE: 0.7399
🔹 R² Score: 0.5822

📊 Model: Random Forest
🔹 MAE: 0.3277
🔹 RMSE: 0.5065
🔹 R² Score: 0.8042

📊 Model: Gradient Boosting
🔹 MAE: 0.3718
🔹 RMSE: 0.5430
🔹 R² Score: 0.7750

🏆 Best Random Forest Model (After Hyperparameter Tuning)
🔹 Best Parameters: {'model__max_depth': None, 'model__n_estimators': 200}
🔹 MAE: 0.3269
🔹 RMSE: 0.5045
🔹 R² Score: 0.8058
