In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
import os              
import joblib  

In [2]:
# On-disk artifacts written after training and read back for inference.
# The trained regressor:
MODEL_FILE = "housing_model.pkl"
# The fitted preprocessing pipeline (must be applied to inputs before predicting):
PIPELINE_FILE = "housing_pipeline.pkl"


def build_pipeline(num_attribute, cat_attribute):
    """Assemble the (unfitted) preprocessing transformer for the housing data.

    Parameters
    ----------
    num_attribute : list of str
        Names of the numerical columns. They are median-imputed and then
        standardized.
    cat_attribute : list of str
        Names of the categorical columns. They are one-hot encoded with
        ``handle_unknown="ignore"`` so categories not seen while fitting do
        not raise at transform time.

    Returns
    -------
    ColumnTransformer
        Transformer with a "num" branch and a "cat" branch; the caller is
        responsible for fitting it.
    """
    # Steps applied, in order, to every numerical column.
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]

    # Steps applied to every categorical column.
    categorical_steps = [
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    # Route each group of columns through its own sub-pipeline.
    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), num_attribute),
            ("cat", Pipeline(steps=categorical_steps), cat_attribute),
        ]
    )



# Train-or-infer switch.
# Training runs when EITHER persisted artifact is missing: inference needs the
# fitted pipeline as well as the model, so checking only MODEL_FILE would let
# the inference branch crash on a missing PIPELINE_FILE.
if not (os.path.exists(MODEL_FILE) and os.path.exists(PIPELINE_FILE)):
    housing = pd.read_csv("housing.csv")

    # Bucket median_income into five categories; the column is used only to
    # stratify the train/test split and is dropped again right after.
    housing['income_cat'] = pd.cut(
        housing["median_income"],
        bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
        labels=[1, 2, 3, 4, 5]
    )

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # n_splits=1, so there is exactly one (train, test) pair of positional indices.
    train_set, test_set = next(split.split(housing, housing['income_cat']))

    # The held-out rows become the inference input file. Note that the
    # median_house_value column is still present in input.csv.
    housing.iloc[test_set].drop("income_cat", axis=1).to_csv("input.csv", index=False)
    # From here on `housing` is the training subset only.
    housing = housing.iloc[train_set].drop("income_cat", axis=1)

    # Separate the target from the features.
    housing_labels = housing["median_house_value"].copy()
    housing_features = housing.drop("median_house_value", axis=1)

    # Every feature column except ocean_proximity is treated as numerical.
    num_attribute = housing_features.drop("ocean_proximity", axis=1).columns.tolist()
    cat_attribute = ["ocean_proximity"]

    # Build Pipeline and Transform data
    pipeline = build_pipeline(num_attribute, cat_attribute)
    housing_prepared = pipeline.fit_transform(housing_features)

    # Train the model
    model = RandomForestRegressor(random_state=42)
    model.fit(housing_prepared, housing_labels)

    # Save the model and pipeline
    joblib.dump(model, MODEL_FILE)
    joblib.dump(pipeline, PIPELINE_FILE)
    print("Model is trained")

else:
    # Inference
    # NOTE(review): joblib.load unpickles arbitrary objects; only load
    # artifacts that were produced by this notebook / a trusted source.
    model = joblib.load(MODEL_FILE)
    pipeline = joblib.load(PIPELINE_FILE)

    input_data = pd.read_csv("input.csv")
    # Apply the already-fitted preprocessing, then predict.
    transformed_input = pipeline.transform(input_data)
    predictions = model.predict(transformed_input)
    # Predictions are stored in a new column next to the input columns.
    input_data['median_house_values'] = predictions

    input_data.to_csv("output.csv", index=False)
    print("Inference is completed, results saved to output.csv")
    

Model is trained
