In [386]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [387]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.expand_frame_repr', False)

Feature Engineering

In [388]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split

In [389]:
data = pd.read_csv("../data/train.csv")
# X_train.drop(columns=['Id'],inplace=True)
X_train, X_test = train_test_split(data,test_size=0.2,random_state=42)
Y_train = X_train.pop('SalePrice')
X_train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object

In [390]:
from sklearn.base import BaseEstimator, TransformerMixin

class DropNaHeavyColumns(BaseEstimator,TransformerMixin):
    def __init__(self,threshold:int = 0.6):
       self.drop_cols = []
       self.threshold = threshold

    def fit(self, X, y=None):
        self.drop_cols = [col for col in X if X[col].isna().sum()>=X.shape[0]*self.threshold]
        # print('drop_na',len(self.drop_cols))
        return self

    def transform(self, X:pd.DataFrame):
        X_transformed = X.copy()
        X_transformed.drop(columns=self.drop_cols,inplace=True)
        return X_transformed

In [391]:
class CustomImputer(BaseEstimator,TransformerMixin):
    def __init__(self, strategy:str = 'most_frequent'):
       self.imputer = None
       self.strategy = strategy

    def fit(self, X, y=None):
        X_copy = X.copy()
        self.imputer = SimpleImputer(strategy=self.strategy)
        self.imputer.fit(X_copy)
        return self

    def transform(self, X:pd.DataFrame):
        X_transformed = X.copy()
        ret_val = pd.DataFrame(self.imputer.transform(X),columns=X_transformed.columns)

        return ret_val.astype(X_transformed.dtypes)

In [392]:
class CustomColumnTransformer(BaseEstimator,TransformerMixin):
    def __init__(self,threshold:int = 3):
       self.ordinal = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
       self.one_hot = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
       self.one_hot_cols = []
       self.ordinal_cols = []
       self.threshold = threshold

    def fit(self, X:pd.DataFrame, y=None):
        X_copy = X.copy()
        self.ordinal_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique()>self.threshold]
        self.one_hot_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique()<=self.threshold]
        self.ordinal.fit(X_copy[self.ordinal_cols])
        self.one_hot.fit(X_copy[self.one_hot_cols])
        return self

    def transform(self, X:pd.DataFrame):
        X_copy = X.copy()
        X_copy[self.ordinal_cols] = pd.DataFrame(self.ordinal.transform(X_copy[self.ordinal_cols]),columns=self.ordinal_cols)
        hot_cols = X_copy[self.one_hot_cols] 
        one_hot =  pd.DataFrame(self.one_hot.transform(hot_cols),columns=self.one_hot.get_feature_names_out())
        X_copy.drop(columns=self.one_hot_cols,inplace=True)
        return pd.concat([X_copy,one_hot],axis=1)


In [414]:
class Scaler(BaseEstimator,TransformerMixin):
    def __init__(self, scaler = StandardScaler()):
        self.scaler = scaler
        self.cat_cols = []

    def fit(self, X:pd.DataFrame, y=None):
        if self.scaler == None:
            return self
        self.cat_cols = [c for c in X if X[c].dtype != 'object']
        self.scaler.fit(X[self.cat_cols].copy())
        return self

    def transform(self, X):
        X_copy = X.copy()
        if self.scaler == None:
            return X_copy
        X_copy[self.cat_cols] = pd.DataFrame(self.scaler.transform(X_copy[self.cat_cols]),columns=self.cat_cols)
        return X_copy

Feature Selection

In [394]:
class RFE(BaseEstimator,TransformerMixin):
    def __init__(self, scaler = StandardScaler()):
       self.scaler = scaler

    def fit(self, X:pd.DataFrame, y=None):
        self.scaler.fit(X.copy())
        return self

    def transform(self, X):
        X_copy = X.copy()
        return self.scaler.transform(X_copy)    

In [395]:
class CorrelationFilter(BaseEstimator,TransformerMixin):
    def __init__(self, threshold:int = 0.7):
        self.threshold = threshold
        self.features_to_drop = []

    def fit(self, X:pd.DataFrame, y):
        X_copy = X.copy()
        corr_matrix = X_copy.corr().abs()
        high_corr_pairs = []

        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                if corr_matrix.iloc[i, j] > self.threshold:
                    high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))


        features_to_drop = []
        for feat1, feat2, _ in high_corr_pairs:
            if abs(X[feat1].corr(y)) < abs(X[feat2].corr(y)):
                features_to_drop.append(feat1)
            else:
                features_to_drop.append(feat2)
        self.features_to_drop = features_to_drop
        print('Dropped Features',len(self.features_to_drop))
        return self

    def transform(self, X:pd.DataFrame):
        X_copy = X.copy()
        return X_copy.drop(columns=self.features_to_drop)

Training

In [396]:
pipeline = Pipeline(steps=[
    ('dropna', DropNaHeavyColumns(0.5)),
    ('imputer', CustomImputer()),
    ('Scaler', Scaler(RobustScaler())),
    ('cat2num', CustomColumnTransformer(threshold=30)),
    ('corr_filter', CorrelationFilter(0.7)),
    ('Model', LinearRegression())
])

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=40)

scalers = [
    MinMaxScaler(),
    RobustScaler(),
    None
]


param_grid = {
    # 'cat2num__threshold': [7,10,20,30,50],
    # 'Scaler__scaler': scalers,
    # 'dropna__threshold' : [0.4,0.5,0.6,0.7,0.8,1], 
    # 'corr_filter__threshold': [0.6,0.7,0.8,0.9,0.95,0.98]
}

scoring = {
    'mae': 'neg_mean_absolute_error',
    'rmse': 'neg_root_mean_squared_error',
    'r2': 'r2'
}

grid_search = GridSearchCV(
    pipeline,
    cv=kfold,
    scoring=scoring,
    return_train_score=True,
    refit='r2',
    verbose=0,
    param_grid=param_grid
)

In [417]:
grid_search.fit(X_train,Y_train)

  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 46


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 45


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 47
Dropped Features 48


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 45


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 46


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 45


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 47
Dropped Features 48


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 45


5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Achi\Desktop\ML01_House-Prices\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Achi\Desktop\ML01_House-Prices\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Achi\Desktop\ML01_House-Prices\.venv\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^

Dropped Features 44


In [418]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Best parameters: {'Scaler__scaler': MinMaxScaler()}
Best cross-validation score: 0.7743


In [419]:
results = pd.DataFrame(grid_search.cv_results_)

In [420]:
results = results.sort_values(by='mean_test_rmse', ascending=False)

In [421]:
cv_results =  results[['params','mean_test_mae','mean_train_mae','mean_test_r2','mean_train_r2','mean_test_rmse','mean_train_rmse']]

In [422]:
# for c in cv_results.index:
#     print(cv_results.iloc[c].iloc[0],cv_results.iloc[c].iloc[1],cv_results.iloc[c].iloc[2],cv_results.iloc[c].iloc[3],cv_results.iloc[c].iloc[4],cv_results.iloc[c].iloc[5],cv_results.iloc[c].iloc[6])

cv_results

Unnamed: 0,params,mean_test_mae,mean_train_mae,mean_test_r2,mean_train_r2,mean_test_rmse,mean_train_rmse
0,{'Scaler__scaler': MinMaxScaler()},-18874.202244,-13338.902943,0.774251,0.934605,-35485.021826,-19740.107482
1,{'Scaler__scaler': RobustScaler()},-18874.202244,-13338.902943,0.774251,0.934605,-35485.021826,-19740.107482
2,{'Scaler__scaler': None},,,,,,


In [404]:
grid_search.best_estimator_

In [405]:
from sklearn.model_selection import cross_validate

scoring = {
    'mae': 'neg_mean_absolute_error',
    'rmse': 'neg_root_mean_squared_error',
    'r2': 'r2'
}

# p = Pipeline(pipeline.steps)

results = cross_validate(pipeline, X_train, Y_train, cv=KFold(shuffle=True,random_state=40), scoring=scoring, return_train_score=True)

train_rmse = -1*np.mean(results['train_rmse'])
test_rmse = -1*np.mean(results['test_rmse'])

print(f"train rmse: {train_rmse:.4f}")
print(f"test rmse: {test_rmse:.4f}")


train_r2 = np.mean(results['train_r2'])
test_r2 = np.mean(results['test_r2'])

print(f"train r2: {(train_r2):.4f}")
print(f"test r2: {(test_r2):.4f}")

train_mae = -1*np.mean(results['train_mae'])
test_mae = -1*np.mean(results['test_mae'])

print(f"train mae: {train_mae:.4f}")
print(f"test mae: {test_mae:.4f}")


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 46


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 45


  c /= stddev[:, None]
  c /= stddev[None, :]


Dropped Features 47
Dropped Features 48
Dropped Features 45
train rmse: 19740.1075
test rmse: 35485.0218
train r2: 0.9346
test r2: 0.7743
train mae: 13338.9029
test mae: 18874.2022


  c /= stddev[:, None]
  c /= stddev[None, :]


Upload to Dagshub

In [410]:
import dagshub
dagshub.init(repo_owner='azhgh22', repo_name='ML01_House-Prices', mlflow=True)

In [407]:
pipeline.get_params()['steps']
grid_search.best_params_

{}

In [408]:
param_grid

{}

In [413]:
import mlflow
mlflow.set_experiment("Experiment 1: Linear Regression")
with mlflow.start_run(run_name="run1"):
    mlflow.log_params({
        'droped features threshold': 'droped (NA count) >= (row count)*0.5',
        'Imputer' : 'fill with mode',
        'Cat2num' : 'OrdinalEncoder for features having 30+ categorical values, else OneHotEncoding',
        'Correlation filter thresholds' : 0.7,
        'Scaler' : 'RobustScaler',
        'kfold splits': 5,
        'ModelType' : 'LinearRegression',
        'Score' : 'neg_root_mean_squared_error',
    })
    mlflow.log_metrics({
        'train_rmse': train_rmse,
        'test_rmse' : test_rmse,
        'train_r2' : train_r2,
        'test_r2' : test_r2,
        'train_mae' : train_mae,
        'test_mae' : test_mae
    })
    mlflow.sklearn.log_model(pipeline.fit(X_train,Y_train),'LinregressionPipeline')
    
    mlflow.end_run()

Dropped Features 44




🏃 View run run1 at: https://dagshub.com/azhgh22/ML01_House-Prices.mlflow/#/experiments/1/runs/16864f97bb2946b3918a6e7bba5ed9b9
🧪 View experiment at: https://dagshub.com/azhgh22/ML01_House-Prices.mlflow/#/experiments/1
