# ML Pipeline 

In [1]:
import os
# Capture the kernel's working directory at start-up; the bare name on the
# last line makes the notebook echo the path as the cell output.
current_dir = os.getcwd()
current_dir

'/Users/allaafaham/Library/Mobile Documents/com~apple~CloudDocs/cursor projects/house-prices/notebooks'

In [2]:
# Move the working directory one level up from the path captured in
# `current_dir`, so that relative paths used later resolve from there.
# NOTE: this cell is not idempotent -- once `current_dir` has been refreshed,
# running it again climbs one more level.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

You set a new current directory


In [3]:
# Re-read the working directory so `current_dir` reflects the location after
# the chdir above; the bare name echoes it as the cell output.
current_dir = os.getcwd()
current_dir

'/Users/allaafaham/Library/Mobile Documents/com~apple~CloudDocs/cursor projects/house-prices'

# Load data

In [15]:
import pandas as pd

# Read the cleaned dataset. The path is relative to the current working
# directory, so the chdir cell above must have run first.
df = pd.read_csv("outputs/datasets/cleaned/house-prices-records-relevantfeatures.csv")

# Summarise the frame: overall shape, then per-column dtypes and null counts.
# (A stray bare `df.shape` expression was removed here: it was not the last
# expression of the cell, so it displayed nothing.)
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()



Dataset Shape: (1460, 9)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SalePrice    1460 non-null   int64  
 1   OverallQual  1460 non-null   int64  
 2   GrLivArea    1460 non-null   int64  
 3   GarageArea   1460 non-null   int64  
 4   YearBuilt    1460 non-null   int64  
 5   GarageYrBlt  1460 non-null   float64
 6   TotalBsmtSF  1460 non-null   int64  
 7   1stFlrSF     1460 non-null   int64  
 8   KitchenQual  1460 non-null   object 
dtypes: float64(1), int64(7), object(1)
memory usage: 102.8+ KB


In [18]:
# Hold out 20% of the rows for evaluation; SalePrice is the target and every
# other column is a feature. The fixed random_state keeps the split repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['SalePrice']),
    df['SalePrice'],
    test_size=0.2,
    random_state=0,
)

# Shapes in the order: X_train, y_train, X_test, y_test
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(1168, 8) (1168,) (292, 8) (292,)


In [21]:
# Preprocessing: one ColumnTransformer that routes each column group to its
# own treatment.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Column groups
num_features = [
    'GrLivArea',
    'GarageArea',
    'YearBuilt',
    'GarageYrBlt',
    'TotalBsmtSF',
    '1stFlrSF',
]
cat_features = ['KitchenQual']
passthrough_features = ['OverallQual']

# - 'num':  standardise the numeric columns
# - 'cat':  encode KitchenQual with an explicit category order, 'Po' first and
#           'Ex' last (presumably worst-to-best quality -- confirm against the
#           data dictionary)
# - 'pass': forward OverallQual unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']]), cat_features),
        ('pass', 'passthrough', passthrough_features),
    ]
)



In [22]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Candidate regressors, keyed by the label used in the results table.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Maps model label -> {'R²', 'MAE', 'RMSE'} measured on the held-out split.
results = {}

# NOTE(review): candidates are ranked on the test split itself; consider
# cross-validating on X_train if the test score must remain an unbiased
# estimate of the chosen model's performance.
for name, model in models.items():
    # Pipeline stores its steps by reference, so handing the same
    # `preprocessor` object to every pipeline would make all of them share
    # (and repeatedly refit) one transformer. Give each pipeline its own
    # unfitted copy instead.
    pipeline = Pipeline(steps=[
        ('preprocessing', clone(preprocessor)),
        ('regressor', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    results[name] = {
        'R²': r2_score(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        # RMSE is the square root of the mean squared error
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred))
    }

In [23]:
import pandas as pd

# One row per model, one column per metric, best R² first.
results_df = (
    pd.DataFrame(results)
    .transpose()
    .sort_values(by='R²', ascending=False)
)
print(results_df)

                         R²           MAE          RMSE
Gradient Boosting  0.878610  19054.476879  28953.477898
Random Forest      0.847877  19604.076792  32411.981728
Decision Tree      0.781422  25972.229452  38851.814647
Linear Regression  0.651131  25820.634349  49083.966518


In [25]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor

# Final model: preprocessing followed by a gradient boosting regressor.
# The preprocessor is cloned so this pipeline owns its own transformer rather
# than aliasing the shared `preprocessor` object, which other pipelines in
# this notebook also reference and could refit.
final_pipeline = Pipeline(steps=[
    ('preprocessing', clone(preprocessor)),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# fit() returns the pipeline, so the notebook also displays it as the cell output.
final_pipeline.fit(X_train, y_train)

In [26]:
import joblib

# Persist the fitted pipeline; the path is relative to the current working
# directory. The returned list of written filenames is shown as the cell output.
joblib.dump(final_pipeline, filename='gradient_boosting_pipeline.pkl')

['gradient_boosting_pipeline.pkl']

In [27]:
# Smoke test: reload the saved artifact and predict for the first test row.
# The file is the one written by this notebook; do not point joblib.load at
# files from untrusted sources, since unpickling can execute arbitrary code.
loaded_model = joblib.load('gradient_boosting_pipeline.pkl')
prediction = loaded_model.predict(X_test.iloc[:1])
print("Prediction:", prediction)

Prediction: [182670.07377502]
