In [23]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [24]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,OrdinalEncoder, OneHotEncoder, PowerTransformer
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge,ElasticNet,Lasso,Ridge
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor 

In [25]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [27]:
df = df.drop(columns=['Id','Alley','MasVnrType','PoolQC','Fence','MiscFeature'],axis = 1)

In [28]:
X = df.drop('SalePrice',axis = 1)
y = df['SalePrice']

In [29]:
X.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF

In [30]:
X_numeric = X.select_dtypes(include = ['int64','float64']).columns.tolist()
X_categorical = X.select_dtypes(include = ['object']).columns.tolist()

In [31]:
low_card_cat = [col for col in X_categorical if X[col].nunique() <= 5]
high_card_cat = [col for col in X_categorical if X[col].nunique() > 5]

In [32]:
for column in X_numeric:
    skew_value = df[column].skew()
    if skew_value > 0.5 or skew_value < -0.5:
        print(f"Skewed   ({skew_value:.3f}): {column}")
    else:
        print(f"Normal   ({skew_value:.3f}): {column}")


Skewed   (1.408): MSSubClass
Skewed   (2.164): LotFrontage
Skewed   (12.208): LotArea
Normal   (0.217): OverallQual
Skewed   (0.693): OverallCond
Skewed   (-0.613): YearBuilt
Skewed   (-0.504): YearRemodAdd
Skewed   (2.669): MasVnrArea
Skewed   (1.686): BsmtFinSF1
Skewed   (4.255): BsmtFinSF2
Skewed   (0.920): BsmtUnfSF
Skewed   (1.524): TotalBsmtSF
Skewed   (1.377): 1stFlrSF
Skewed   (0.813): 2ndFlrSF
Skewed   (9.011): LowQualFinSF
Skewed   (1.367): GrLivArea
Skewed   (0.596): BsmtFullBath
Skewed   (4.103): BsmtHalfBath
Normal   (0.037): FullBath
Skewed   (0.676): HalfBath
Normal   (0.212): BedroomAbvGr
Skewed   (4.488): KitchenAbvGr
Skewed   (0.676): TotRmsAbvGrd
Skewed   (0.650): Fireplaces
Skewed   (-0.649): GarageYrBlt
Normal   (-0.343): GarageCars
Normal   (0.180): GarageArea
Skewed   (1.541): WoodDeckSF
Skewed   (2.364): OpenPorchSF
Skewed   (3.090): EnclosedPorch
Skewed   (10.304): 3SsnPorch
Skewed   (4.122): ScreenPorch
Skewed   (14.828): PoolArea
Skewed   (24.477): MiscVal
No

In [33]:
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('yeo-johnson',PowerTransformer(method='yeo-johnson'))
])

In [34]:
ordinal_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ode',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

In [35]:
nominal_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ode', OneHotEncoder(sparse_output=False, drop='if_binary', handle_unknown='ignore'))
])

In [36]:
preprocessor = ColumnTransformer([
    ('num',numeric_transformer,X_numeric),
    ('ode',ordinal_transformer,low_card_cat),
    ('ohe',nominal_transformer,high_card_cat)
])

In [37]:
models = {
    'Lr':LinearRegression(),
    'L2':Ridge(),
    'L1':Lasso(),
    'ElasticNet':ElasticNet()
}


In [38]:
param_grids = {
    'Lr': {},
    'L2': {'model__alpha': [0.01, 0.1, 1, 10]},
    'L1': {'model__alpha': [0.01, 0.1, 1]},
    'ElasticNet': {
        'model__alpha': [0.01, 0.1, 1],
        'model__l1_ratio': [0.2, 0.5, 0.8]
    }
}

In [39]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [40]:
for name, model in models.items():
    
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',model)
    ])

    grid = GridSearchCV(pipe, param_grids[name], cv=10, scoring='r2')
    grid.fit(X_train,y_train)
    y_pred = grid.predict(X_test)
    print(model,':',r2_score(y_test,y_pred))
print("Best Params:", grid.best_params_)


LinearRegression() : -4.768097858015575
Ridge() : 0.8514548228424439
Lasso() : 0.8646279269675092
ElasticNet() : 0.8471779015018532
Best Params: {'model__alpha': 0.1, 'model__l1_ratio': 0.8}


In [41]:
models_tree = {
    'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, verbosity=0)
}

In [42]:
param_grids_tree = {
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [5, 10, 15]
    },
    'GradientBoosting': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1],
        'model__max_depth': [3, 5]
    },
    'XGBoost': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1],
        'model__max_depth': [3, 5]
    }
}

In [45]:
for name, model in models_tree.items():
    
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',model)
    ])

    grid = GridSearchCV(pipe, param_grids_tree[name], cv=10, scoring='r2')
    grid.fit(X_train,y_train)
    y_pred = grid.predict(X_test)

    
    print(f"\n{name}")
    print("Best Params:", grid.best_params_)
    print("CV R² (Best Score):", grid.best_score_)
    print("Test R²:", r2_score(y_test, y_pred))



RandomForest
Best Params: {'model__max_depth': 15, 'model__n_estimators': 100}
CV R² (Best Score): 0.8427633095490801
Test R²: 0.891172547856312

GradientBoosting
Best Params: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 200}
CV R² (Best Score): 0.8642667063722875
Test R²: 0.8942692005309819

XGBoost
Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}
CV R² (Best Score): 0.8644922316074372
Test R²: 0.9106284379959106


In [46]:
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}
Best Score: 0.8644922316074372


In [47]:
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(
        learning_rate=0.1,
        max_depth=3,
        n_estimators=200,
        random_state=42
    ))
])

final_model.fit(X_train, y_train)
final_preds = final_model.predict(X_test)
print("Final Test R²:", r2_score(y_test, final_preds))


Final Test R²: 0.9106284379959106


In [49]:
import joblib
joblib.dump(final_model, 'xgb_pipeline_model.pkl')


['xgb_pipeline_model.pkl']

In [51]:
import matplotlib.pyplot as plt

xgb = final_model.named_steps['model']
importances = xgb.feature_importances_

# Get column names after preprocessing
feature_names = final_model.named_steps['preprocessor'].get_feature_names_out()
feature_names


array(['num__MSSubClass', 'num__LotFrontage', 'num__LotArea',
       'num__OverallQual', 'num__OverallCond', 'num__YearBuilt',
       'num__YearRemodAdd', 'num__MasVnrArea', 'num__BsmtFinSF1',
       'num__BsmtFinSF2', 'num__BsmtUnfSF', 'num__TotalBsmtSF',
       'num__1stFlrSF', 'num__2ndFlrSF', 'num__LowQualFinSF',
       'num__GrLivArea', 'num__BsmtFullBath', 'num__BsmtHalfBath',
       'num__FullBath', 'num__HalfBath', 'num__BedroomAbvGr',
       'num__KitchenAbvGr', 'num__TotRmsAbvGrd', 'num__Fireplaces',
       'num__GarageYrBlt', 'num__GarageCars', 'num__GarageArea',
       'num__WoodDeckSF', 'num__OpenPorchSF', 'num__EnclosedPorch',
       'num__3SsnPorch', 'num__ScreenPorch', 'num__PoolArea',
       'num__MiscVal', 'num__MoSold', 'num__YrSold', 'ode__MSZoning',
       'ode__Street', 'ode__LotShape', 'ode__LandContour',
       'ode__Utilities', 'ode__LotConfig', 'ode__LandSlope',
       'ode__BldgType', 'ode__ExterQual', 'ode__ExterCond',
       'ode__BsmtQual', 'ode__BsmtCond'

In [52]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# After training your model and making predictions on test set:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"R² Score: {r2:.3f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.1f}%")

R² Score: 0.911
MAE: 16197.27
RMSE: 26182.23
MAPE: 9.8%


In [53]:
X_test.to_csv('test_data.csv', index=False)

In [54]:
# Save y_test as pickle
import joblib
joblib.dump(y_test, 'y_test.pkl')

['y_test.pkl']