LETS BEGIN

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [2]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,OrdinalEncoder, OneHotEncoder, PowerTransformer
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge,ElasticNet,Lasso,Ridge
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor 

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
df = df.drop(columns=['Id','Alley','MasVnrType','PoolQC','Fence','MiscFeature'],axis = 1)

In [6]:
# 1. Combine similar features
df['TotalSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']
df['TotalBath'] = df['FullBath'] + df['HalfBath'] + df['BsmtFullBath'] + df['BsmtHalfBath']
df['TotalPorch'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']

# 2. Age features
df['HouseAge'] = df['YrSold'] - df['YearBuilt']
df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']

# 3. Yes/No features
df['HasGarage'] = (df['GarageArea'] > 0).astype(int)
df['HasFireplace'] = (df['Fireplaces'] > 0).astype(int)
df['HasPool'] = (df['PoolArea'] > 0).astype(int)
df['HasBasement'] = (df['TotalBsmtSF'] > 0).astype(int)
df['Has2ndFloor'] = (df['2ndFlrSF'] > 0).astype(int)

# 4. Simple ratios
df['LivAreaRatio'] = df['GrLivArea'] / df['LotArea']
df['GarageRatio'] = df['GarageArea'] / (df['TotalSF'] + 1)  # +1 to avoid division by zero

# 5. Quality average
df['AvgQuality'] = (df['OverallQual'] + df['OverallCond']) / 2

# 6. Convert YrSold to object for encoding later
df['YrSold'] = df['YrSold'].astype(str)

# 7. Fill any missing values in new features
new_features = ['TotalSF', 'TotalBath', 'TotalPorch', 'HouseAge', 'RemodAge', 
                'LivAreaRatio', 'GarageRatio', 'AvgQuality']
for col in new_features:
    df[col] = df[col].fillna(0)

print("New features created successfully!")

New features created successfully!


In [7]:
X = df.drop('SalePrice',axis = 1)
y = df['SalePrice']

In [8]:
X.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF

In [9]:
X_numeric = X.select_dtypes(include = ['int64','float64']).columns.tolist()
X_categorical = X.select_dtypes(include = ['object']).columns.tolist()

In [10]:
low_card_cat = [col for col in X_categorical if X[col].nunique() <= 5]
high_card_cat = [col for col in X_categorical if X[col].nunique() > 5]

In [11]:
for column in X_numeric:
    skew_value = df[column].skew()
    if skew_value > 0.5 or skew_value < -0.5:
        print(f"Skewed   ({skew_value:.3f}): {column}")
    else:
        print(f"Normal   ({skew_value:.3f}): {column}")


Skewed   (1.408): MSSubClass
Skewed   (2.164): LotFrontage
Skewed   (12.208): LotArea
Normal   (0.217): OverallQual
Skewed   (0.693): OverallCond
Skewed   (-0.613): YearBuilt
Skewed   (-0.504): YearRemodAdd
Skewed   (2.669): MasVnrArea
Skewed   (1.686): BsmtFinSF1
Skewed   (4.255): BsmtFinSF2
Skewed   (0.920): BsmtUnfSF
Skewed   (1.524): TotalBsmtSF
Skewed   (1.377): 1stFlrSF
Skewed   (0.813): 2ndFlrSF
Skewed   (9.011): LowQualFinSF
Skewed   (1.367): GrLivArea
Skewed   (0.596): BsmtFullBath
Skewed   (4.103): BsmtHalfBath
Normal   (0.037): FullBath
Skewed   (0.676): HalfBath
Normal   (0.212): BedroomAbvGr
Skewed   (4.488): KitchenAbvGr
Skewed   (0.676): TotRmsAbvGrd
Skewed   (0.650): Fireplaces
Skewed   (-0.649): GarageYrBlt
Normal   (-0.343): GarageCars
Normal   (0.180): GarageArea
Skewed   (1.541): WoodDeckSF
Skewed   (2.364): OpenPorchSF
Skewed   (3.090): EnclosedPorch
Skewed   (10.304): 3SsnPorch
Skewed   (4.122): ScreenPorch
Skewed   (14.828): PoolArea
Skewed   (24.477): MiscVal
No

In [12]:
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('yeo-johnson',PowerTransformer(method='yeo-johnson')),
    ('scaler',StandardScaler())
])

In [13]:
ordinal_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ode',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

In [14]:
nominal_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ode', OneHotEncoder(sparse_output=False, drop='if_binary', handle_unknown='ignore'))
])

In [15]:
preprocessor = ColumnTransformer([
    ('num',numeric_transformer,X_numeric),
    ('ode',ordinal_transformer,low_card_cat),
    ('ohe',nominal_transformer,high_card_cat)
])

In [16]:
models = {
    'Lr':LinearRegression(),
    'L2':Ridge(),
    'L1':Lasso(),
    'ElasticNet':ElasticNet()
}


In [17]:
param_grids = {
    'Lr': {},
    'L2': {'model__alpha': [0.01, 0.1, 1, 10]},
    'L1': {'model__alpha': [0.01, 0.1, 1]},
    'ElasticNet': {
        'model__alpha': [0.01, 0.1, 1],
        'model__l1_ratio': [0.2, 0.5, 0.8]
    }
}

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [19]:
for name, model in models.items():
    
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',model)
    ])

    grid = GridSearchCV(pipe, param_grids[name], cv=10, scoring='r2')
    grid.fit(X_train,y_train)
    y_pred = grid.predict(X_test)
    print(model,':',r2_score(y_test,y_pred))
print("Best Params:", grid.best_params_)


LinearRegression() : -3.7462434266629696
Ridge() : 0.869698262751974
Lasso() : 0.8765330704747208
ElasticNet() : 0.810694669702855
Best Params: {'model__alpha': 1, 'model__l1_ratio': 0.2}


In [20]:
models_tree = {
    'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, verbosity=0)
}

In [21]:
param_grids_tree = {
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [5, 10, 15]
    },
    'GradientBoosting': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1],
        'model__max_depth': [3, 5]
    },
    'XGBoost': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1],
        'model__max_depth': [3, 5]
    }
}

In [22]:
for name, model in models_tree.items():
    
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',model)
    ])

    grid = GridSearchCV(pipe, param_grids_tree[name], cv=10, scoring='r2')
    grid.fit(X_train,y_train)
    y_pred = grid.predict(X_test)

    
    print(f"\n{name}")
    print("Best Params:", grid.best_params_)
    print("CV R² (Best Score):", grid.best_score_)
    print("Test R²:", r2_score(y_test, y_pred))



RandomForest
Best Params: {'model__max_depth': 15, 'model__n_estimators': 100}
CV R² (Best Score): 0.8505308691325573
Test R²: 0.888297696441947

GradientBoosting
Best Params: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 200}
CV R² (Best Score): 0.8702199736091846
Test R²: 0.909214412204057

XGBoost
Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}
CV R² (Best Score): 0.8741499423980713
Test R²: 0.9154930114746094


In [23]:
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}
Best Score: 0.8741499423980713


In [24]:
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(
        learning_rate=0.1,
        max_depth=3,
        n_estimators=200,
        random_state=42
    ))
])

final_model.fit(X_train, y_train)
final_preds = final_model.predict(X_test)
print("Final Test R²:", r2_score(y_test, final_preds))


Final Test R²: 0.9154930114746094


In [25]:
import joblib
joblib.dump(final_model, 'xgb_pipeline_model.pkl')


['xgb_pipeline_model.pkl']

In [26]:
import matplotlib.pyplot as plt

xgb = final_model.named_steps['model']
importances = xgb.feature_importances_

# Get column names after preprocessing
feature_names = final_model.named_steps['preprocessor'].get_feature_names_out()
feature_names


array(['num__MSSubClass', 'num__LotFrontage', 'num__LotArea',
       'num__OverallQual', 'num__OverallCond', 'num__YearBuilt',
       'num__YearRemodAdd', 'num__MasVnrArea', 'num__BsmtFinSF1',
       'num__BsmtFinSF2', 'num__BsmtUnfSF', 'num__TotalBsmtSF',
       'num__1stFlrSF', 'num__2ndFlrSF', 'num__LowQualFinSF',
       'num__GrLivArea', 'num__BsmtFullBath', 'num__BsmtHalfBath',
       'num__FullBath', 'num__HalfBath', 'num__BedroomAbvGr',
       'num__KitchenAbvGr', 'num__TotRmsAbvGrd', 'num__Fireplaces',
       'num__GarageYrBlt', 'num__GarageCars', 'num__GarageArea',
       'num__WoodDeckSF', 'num__OpenPorchSF', 'num__EnclosedPorch',
       'num__3SsnPorch', 'num__ScreenPorch', 'num__PoolArea',
       'num__MiscVal', 'num__MoSold', 'num__TotalSF', 'num__TotalBath',
       'num__TotalPorch', 'num__HouseAge', 'num__RemodAge',
       'num__HasGarage', 'num__HasFireplace', 'num__HasPool',
       'num__HasBasement', 'num__Has2ndFloor', 'num__LivAreaRatio',
       'num__GarageRatio', 

In [27]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# After training your model and making predictions on test set:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"R² Score: {r2:.3f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.1f}%")

R² Score: 0.915
MAE: 15381.75
RMSE: 25459.70
MAPE: 9.2%


In [28]:
X_test.to_csv('test_data.csv', index=False)

In [29]:
# Save y_test as pickle
import joblib
joblib.dump(y_test, 'y_test.pkl')

['y_test.pkl']