In [331]:
#import relevant packages 
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go
import plotly.express as px
import scipy.stats as stats
from IPython.display import display, HTML
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import OrthogonalMatchingPursuit


In [332]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
print(train.shape,test.shape)

(1460, 81) (1459, 80)


In [333]:
train.columns


Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [334]:
# Function to create scrollable table within a small window
def create_scrollable_table(df, table_id, title):
    html = f'<h3>{title}</h3>'
    html += f'<div id="{table_id}" style="height:200px; overflow:auto;">'
    html += df.to_html()
    html += '</div>'
    return html

In [335]:
# Summary statistics for numerical features
numerical_features =train.select_dtypes(include=[np.number])
summary_stats = numerical_features.describe().T
html_numerical = create_scrollable_table(summary_stats, 'numerical_features', 'Summary statistics for numerical features')

display(HTML(html_numerical))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [336]:
# Summary statistics for categorical  features
categorical_features =train.select_dtypes(include=['object'])
summary_stats = categorical_features.describe().T
html_categorical = create_scrollable_table(summary_stats, 'categorical_features', 'Summary statistics for categorical features')

display(HTML(html_categorical))

Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


In [337]:
# Get missing values and calculate percentages
missing = train.isna().sum()[train.isna().sum() > 0].sort_values(ascending=False)
df_missing = pd.DataFrame(missing, columns=['Missing'])
df_missing['Percent'] = df_missing['Missing'] * 100 / train.shape[0]

# Display scrollable table for missing values
table_id = 'missing-table'
title = 'Missing Values'
html = create_scrollable_table(df_missing, table_id, title)
display(HTML(html))

Unnamed: 0,Missing,Percent
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageType,81,5.547945
GarageYrBlt,81,5.547945
GarageFinish,81,5.547945
GarageQual,81,5.547945


In [338]:
def handle_missing_values(data):
    # Handle missing values in Garage related columns
    garage_columns = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
    for col in garage_columns:
        data[col].fillna('No Garage', inplace=True)
    
    # Handle missing values in Basement related columns
    basement_columns = ['BsmtFinType1', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2']
    for col in basement_columns:
        data[col].fillna('No Basement', inplace=True)
    
    # Fill missing values in other categorical features
    data['PoolQC'].fillna('No Pool', inplace=True)
    data['Fence'].fillna('No Fence', inplace=True)
    data['MiscFeature'].fillna('No more Feat', inplace=True)
    data['FireplaceQu'].fillna('No Fire place', inplace=True)
    data['Alley'].fillna('No Alley', inplace=True)
    data['MasVnrType'].fillna('No Veneer', inplace=True)
    data['Electrical'].fillna(data['Electrical'].mode()[0], inplace=True)
    
    # Fill missing values in numeric features
    data['GarageYrBlt'].fillna(0, inplace=True)
    data['LotFrontage'].fillna(data['LotFrontage'].mean(), inplace=True)
    data['MasVnrArea'].fillna(0, inplace=True)

# Apply the function to train data
train1 = train.copy()
handle_missing_values(train1)




In [339]:
def check_missing_values(df):
    # Check for missing values
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    
    if missing_values.empty:
        print("No missing values in the dataset.")
    else:
        print("Missing values in the dataset:")
        print(missing_values)
check_missing_values(train1)

No missing values in the dataset.


## Explanatory data analysis 

* tagret variable

* variable numerical distributions

* Are there any correlations between numerical features and the target variable 'SalePrice'? Which features show the strongest correlations?


* What are the different values for categorical features like 'MSZoning', 'Street', 'Alley'  and their impact on the target price ? 

* What is the Average sale price by property shape?

* Is there a Correlation between Property Age and Sale Price

* Is there a Correlation between Living Area and Sale Price

*  Does price change year to year?

In [340]:
# Fit a normal distribution to the SalePrice data
mu, sigma = stats.norm.fit(train1['SalePrice'])

# Create a histogram of the SalePrice column
hist_data = go.Histogram(x=train1['SalePrice'], nbinsx=50, name="Histogram", opacity=0.75, histnorm='probability density', marker=dict(color='purple'))

# Calculate the normal distribution based on the fitted parameters
x_norm = np.linspace(train1['SalePrice'].min(),train1['SalePrice'].max(), 100)
y_norm = stats.norm.pdf(x_norm, mu, sigma)

# Create the normal distribution overlay
norm_data = go.Scatter(x=x_norm, y=y_norm, mode="lines", name=f"Normal dist. (μ={mu:.2f}, σ={sigma:.2f})", line=dict(color="green"))

# Combine the histogram and the overlay
fig = go.Figure(data=[hist_data, norm_data])

# Set the layout for the plot
fig.update_layout(
    title="SalePrice Distribution",
    xaxis_title="SalePrice",
    yaxis_title="Density",
    legend_title_text="Fitted Normal Distribution",
    plot_bgcolor='rgba(32, 32, 32, 1)',
    paper_bgcolor='rgba(32, 32, 32, 1)',
    font=dict(color='white')
)

# Create a Q-Q plot
qq_data = stats.probplot(train1['SalePrice'], dist="norm")
qq_fig = px.scatter(x=qq_data[0][0], y=qq_data[0][1], labels={'x': 'Theoretical Quantiles', 'y': 'Ordered Values'}, color_discrete_sequence=["purple"])
qq_fig.update_layout(
    title="Q-Q plot",
    plot_bgcolor='rgba(32, 32, 32, 1)',
    paper_bgcolor='rgba(32, 32, 32, 1)',
    font=dict(color='white')
)

# Calculate the line of best fit
slope, intercept, r_value, p_value, std_err = stats.linregress(qq_data[0][0], qq_data[0][1])
line_x = np.array(qq_data[0][0])
line_y = intercept + slope * line_x

# Add the line of best fit to the Q-Q plot
line_data = go.Scatter(x=line_x, y=line_y, mode="lines", name="Normal Line", line=dict(color="green"))

# Update the Q-Q plot with the normal line
qq_fig.add_trace(line_data)

# Show the plots
fig.show()
qq_fig.show()


In [341]:
import plotly.graph_objects as go

# Select the numerical features to analyze
numerical_features = ['LotFrontage', 'LotArea','BsmtFinSF1']

# Iterate over each numerical feature
for feature in numerical_features:
    # Fit a normal distribution to the feature data
    mu, sigma = stats.norm.fit(train1[feature])
    
    # Create a histogram of the feature column
    hist_data = go.Histogram(x=train1[feature], nbinsx=50, name="Histogram", opacity=0.75, histnorm='probability density', marker=dict(color='purple'))
    
    # Calculate the normal distribution based on the fitted parameters
    x_norm = np.linspace(train1[feature].min(), train1[feature].max(), 100)
    y_norm = stats.norm.pdf(x_norm, mu, sigma)
    
    # Create the normal distribution overlay
    norm_data = go.Scatter(x=x_norm, y=y_norm, mode="lines", name=f"Normal dist. (μ={mu:.2f}, σ={sigma:.2f})", line=dict(color="green"))
    
    # Create the figure
    fig = go.Figure(data=[hist_data, norm_data])
    
    # Set the layout for the figure
    fig.update_layout(
        title=f"Distribution of {feature}",
        xaxis_title=feature,
        yaxis_title="Density",
        plot_bgcolor='rgba(32, 32, 32, 1)',
        paper_bgcolor='rgba(32, 32, 32, 1)',
        font=dict(color='white')
    )
    
    # Show the figure
    fig.show()


In [342]:
def generate_bar_plot(data, x_column, y_column, title):
    fig = go.Figure()
    fig.add_trace(go.Bar(x=data.index, y=data.values,
                         marker=dict(color=data.values, colorscale='aggrnyl'), text=data.values,
                         texttemplate='$%{text:,.0f}', textposition='outside'))
    
    fig.update_layout(title=title, xaxis_title=x_column, yaxis_title=y_column,
                      plot_bgcolor='rgba(32, 32, 32, 1)', paper_bgcolor='rgba(32, 32, 32, 1)',
                      font=dict(color='white'), uniformtext_minsize=8, uniformtext_mode='hide')
    
    fig.update_yaxes(tickprefix='$', tickformat=',')
    
    fig.show()

# Average Sale Price by Zoning
if 'MSZoning' in train1.columns:
    zoning_prices = train1.groupby('MSZoning')['SalePrice'].mean()
    generate_bar_plot(zoning_prices, 'MSZoning', 'SalePrice', 'Average Sale Price by Zoning')
else:
    print("Column 'MSZoning' not found in the DataFrame.")

# Average Sale Price by Alley
if 'Alley' in train1.columns:
    Alley_prices = train1.groupby('Alley')['SalePrice'].mean()
    generate_bar_plot(Alley_prices, 'Alley', 'SalePrice', 'Average Sale Price by Alley')
else:
    print("Column 'Alley' not found in the DataFrame.")

# Average Sale Price by Street
if 'Street' in train1.columns:
    Street_prices = train1.groupby('Street')['SalePrice'].mean()
    generate_bar_plot(Street_prices, 'Street', 'SalePrice', 'Average Sale Price by Alley')
else:
    print("Column 'Street' not found in the DataFrame.")

In [343]:
# Average Sale Price by Lotshape
if 'LotShape' in train1.columns:
    LotShape_prices = train1.groupby('LotShape')['SalePrice'].mean()
    generate_bar_plot(LotShape_prices, 'LotShape', 'SalePrice', 'Average Sale Price by LotShape')
else:
    print("Column 'LotShape' not found in the DataFrame.")
    
# Average Sale Price by LandContour
if 'LandContour' in train1.columns:
    LandContour_prices = train1.groupby('LandContour')['SalePrice'].mean()
    generate_bar_plot(LandContour_prices, 'LandContour', 'SalePrice', 'Average Sale Price by LandContour')
else:
    print("Column 'LandContour' not found in the DataFrame.")

In [344]:
# Select the numerical columns for correlation analysis
numerical_columns = train1.select_dtypes(include='number')

# Calculate the correlation matrix
correlation_matrix = numerical_columns.corr().astype(float)

# Create a mask for the upper triangular part of the matrix
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Apply the mask to the correlation matrix
masked_correlation_matrix = correlation_matrix.mask(mask)

# Create a trace for the heatmap
trace = go.Heatmap(
    z=masked_correlation_matrix.values,
    x=masked_correlation_matrix.columns,
    y=masked_correlation_matrix.index,
    colorscale='Spectral',
)

# Create the layout
layout = go.Layout(
    title="Correlation Matrix ",
    xaxis=dict(title="Variables"),
    yaxis=dict(title="Variables"),
)

# Create the figure
fig = go.Figure(data=[trace], layout=layout)

# Show the figure
fig.show()

In [345]:
# 5. Calculate Property AgeT
train1['PropertyAge'] = train['YrSold'] - train1['YearBuilt']

# Calculate Correlation between Property Age and Sale Price
age_price_corr = train1['PropertyAge'].corr(train1['SalePrice'])
print(f'Correlation between Property Age and Sale Price: {age_price_corr}\n \n ')

# Create a scatter plot to visualize the relationship between Property Age and Sale Price
fig9 = px.scatter(train1, x='PropertyAge', y='SalePrice', title='Property Age vs Sale Price', color='PropertyAge', color_continuous_scale=px.colors.sequential.Purp)

fig9.update_layout(plot_bgcolor='rgb(30,30,30)', paper_bgcolor='rgb(30,30,30)', font=dict(color='white'))

fig9.show()

Correlation between Property Age and Sale Price: -0.523350417546816
 
 


In [346]:
# 5. Calculate Property remodAgeT
train1['remodage'] = train['YrSold'] - train1['YearRemodAdd']

# Calculate Correlation between Property Age and Sale Price
age_price_corr = train1['remodage'].corr(train1['SalePrice'])
print(f'Correlation between Property remodAge and Sale Price: {age_price_corr}\n\n ')

# Create a scatter plot to visualize the relationship between Property Age and Sale Price
fig9 = px.scatter(train1, x='remodage', y='SalePrice', title='Property Age vs Sale Price', color='remodage', color_continuous_scale=px.colors.sequential.Purp)

fig9.update_layout(plot_bgcolor='rgb(30,30,30)', paper_bgcolor='rgb(30,30,30)', font=dict(color='white'))

fig9.show()

Correlation between Property remodAge and Sale Price: -0.5090787380156292

 


In [347]:
# 6. Calculate Correlation between Living Area and Sale Price
living_area_price_corr = train1['GrLivArea'].corr(train1['SalePrice'])
print(f'Correlation between Living Area (above grade) and Sale Price: {living_area_price_corr}')

# Create a scatter plot to visualize the relationship between Living Area and Sale Price
fig10 = px.scatter(train1, x='GrLivArea', y='SalePrice', title='Living Area (above grade) vs Sale Price', color='GrLivArea', color_continuous_scale=px.colors.sequential.Purp)

fig10.update_layout(plot_bgcolor='rgb(30,30,30)', paper_bgcolor='rgb(30,30,30)', font=dict(color='white'))

fig10.show()

Correlation between Living Area (above grade) and Sale Price: 0.7086244776126523


In [348]:
OverAllQual_prices = train1.groupby('OverallQual')['SalePrice'].mean()
generate_bar_plot(OverAllQual_prices, 'OverallQual', 'SalePrice', 'Average Sale Price by OverAllQual')

In [349]:
# Count the occurrences of each category
count_data = train1['TotRmsAbvGrd'].value_counts()

# Create the count plot
fig = go.Figure(data=go.Bar(x=count_data.index, y=count_data.values,marker=dict(color=count_data.values, colorscale='aggrnyl')))

# Set the layout for the plot
fig.update_layout(
    title='Distribution of Number of Rooms',
    xaxis_title='Number of Rooms',
    yaxis_title='Count',
    plot_bgcolor='rgba(32, 32, 32, 1)',
    paper_bgcolor='rgba(32, 32, 32, 1)',
    font=dict(color='white')
)

# Show the plot
fig.show()

In [350]:
TotRmsAbvGrd_prices = train1.groupby('TotRmsAbvGrd')['SalePrice'].mean()
generate_bar_plot(TotRmsAbvGrd_prices, 'TotRmsAbvGrd', 'SalePrice', 'Average Sale Price by Total rooms above grade')

In [351]:
import plotly.graph_objects as go

# Count the occurrences of each category
count_data = train['Utilities'].value_counts()

# Create the count plot
fig = go.Figure(data=go.Bar(x=count_data.index, y=count_data.values, marker=dict(color=count_data.values, colorscale='aggrnyl')))

# Set the layout for the plot
fig.update_layout(
    title='Distribution of Utilities',
    xaxis_title='Utilities',
    yaxis_title='Count',
    plot_bgcolor='rgba(32, 32, 32, 1)',
    paper_bgcolor='rgba(32, 32, 32, 1)',
    font=dict(color='white')
)

# Show the plot
fig.show()


In [352]:
train1['TotalSF']=train1['1stFlrSF']+train1['TotalBsmtSF']+train1['2ndFlrSF'] 
# Calculate Correlation between Property Age and Sale Price
age_price_corr = train1['TotalSF'].corr(train1['SalePrice'])
print(f'Correlation between Property remodAge and Sale Price: {age_price_corr}\n\n')

# Create a scatter plot to visualize the relationship between Property Age and Sale Price
fig9 = px.scatter(train1, x='TotalSF', y='SalePrice', title='Property Age vs Sale Price', color='TotalSF', color_continuous_scale=px.colors.sequential.Purp)

fig9.update_layout(plot_bgcolor='rgb(30,30,30)', paper_bgcolor='rgb(30,30,30)', font=dict(color='white'))

fig9.show()

Correlation between Property remodAge and Sale Price: 0.7822600527979846




## Preprocess data 

In [394]:
def perform_feature_engineering(data):
    # Convert data type of 'MSSubClass' to object
    data['MSSubClass'] = data['MSSubClass'].astype('O')
    
    # Create 'total_bath' feature
    #data['total_bath'] = data['BsmtFullBath'] + 0.5 * data['BsmtHalfBath'] + data['FullBath'] + 0.5 * data['HalfBath']
    #data.drop(['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'], axis=1, inplace=True)
    
    # Create 'total_bsmt' feature
    #data['total_bsmt'] = data['BsmtFinSF1'] + data['BsmtFinSF2'] + data['BsmtUnfSF']
    #data.drop(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'], axis=1, inplace=True)
    
    # Create 'total_porch' feature
    #data['total_porch'] = data['EnclosedPorch'] + data['OpenPorchSF'] + data['3SsnPorch']
    #data.drop(['OpenPorchSF', 'EnclosedPorch', '3SsnPorch'], axis=1, inplace=True)
    
    # Create 'TotalSF' feature
    data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']
    data.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
    
    # Create 'houseage', 'garageage', and 'remodage' features
    data['PropertyAge'] = abs(data['YrSold'] - data['YearBuilt'])
    data['garageage'] = abs(data['YrSold'] - data['GarageYrBlt'])
    data['remodage'] = abs(data['YrSold'] - data['YearRemodAdd'])
    data.drop(['YearRemodAdd', 'YrSold', 'YearBuilt', 'GarageYrBlt'], axis=1, inplace=True)
    print("Feature engineering task is done\n\n")
    return data.head()
train2=train1.copy()
perform_feature_engineering(train2)

Feature engineering task is done




Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,SalePrice,PropertyAge,remodage,TotalSF,garageage
0,1,60,RL,65.0,8450,Pave,No Alley,Reg,Lvl,AllPub,...,No more Feat,0,2,WD,Normal,208500,5,5,2566,5.0
1,2,20,RL,80.0,9600,Pave,No Alley,Reg,Lvl,AllPub,...,No more Feat,0,5,WD,Normal,181500,31,31,2524,31.0
2,3,60,RL,68.0,11250,Pave,No Alley,IR1,Lvl,AllPub,...,No more Feat,0,9,WD,Normal,223500,7,6,2706,7.0
3,4,70,RL,60.0,9550,Pave,No Alley,IR1,Lvl,AllPub,...,No more Feat,0,2,WD,Abnorml,140000,91,36,2473,8.0
4,5,60,RL,84.0,14260,Pave,No Alley,IR1,Lvl,AllPub,...,No more Feat,0,12,WD,Normal,250000,8,8,3343,8.0


In [395]:
## Dealing with outliers 
c1 = (train2['SalePrice'] <= 200000) & (train2['TotalSF'] >= 6000)
c2 = (train2['SalePrice'] <= 200000) & (train2['GrLivArea'] >= 4000)

def filter_data(data):
    filtered_data = data[~c1 & ~c2].reset_index(drop=True)
    return filtered_data

train2 = filter_data(train2)
# Dropping 'Id' since it is not important in the analysis and this phase variable 'SalePrice'
train2.drop(['Id','Utilities','SalePrice'],axis=1,inplace=True)

In [396]:
# Get the list of column names as features for further processing
features = train2.columns.tolist()

# Separating categorical and numerical variables
cate_feat = [feat for feat in features if train2[feat].dtype == 'O']
nume_feat = [feat for feat in features if feat not in cate_feat]

# Printing the number of categorical and numerical features
print("Number of categorical features:", len(cate_feat))
print("Number of numerical features:", len(nume_feat))

# Defining nominal and ordinal features
nominal = ['MSSubClass', 'MSZoning', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'MiscFeature',
           'BldgType', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'HouseStyle', 'MasVnrType',
           'Foundation', 'Heating', 'Electrical', 'GarageType', 'SaleType', 'SaleCondition']
ordinal = [feat for feat in cate_feat if feat not in nominal]

# Computing skewness of numerical features
skew_data = pd.DataFrame(train2[nume_feat].columns, columns=['Feature'])
skew_data['Skew'] = skew_data['Feature'].apply(lambda x: stats.skew(train2[x]))

# Checking if the features are skewed
skew_data['Skewed'] = skew_data['Skew'].apply(lambda x: True if np.abs(x) >= 0.5 else False)
skewed_columns = skew_data[skew_data['Skewed'] == True]['Feature']

# Applying logarithmic transformation to skewed features
train2[skewed_columns] = np.log1p(train2[skewed_columns])


Number of categorical features: 43
Number of numerical features: 32


In [397]:
# Perform one-hot encoding on categorical features in train2
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore', drop='first')
new = ohe.fit_transform(train2[nominal])
dfOneHot = pd.DataFrame(new, columns=ohe.get_feature_names_out(nominal))

# Concatenate the encoded features with train2
train3 = pd.concat([train2, dfOneHot], axis=1)

# Drop the original categorical features from train3
train3 = train3.drop(nominal, axis=1)

## Encoding categorical features

In [398]:
def map_categorical_features(data):
    # Define mapping dictionaries
    bsm_dict ={'No Basement': 1, "Po": 1, "Fa": 1, "TA": 4, "Gd": 5, "Ex": 6}
    qual_dict = {'None': 1, "Po": 2, "Fa": 3, "TA":4, "Gd": 5, "Ex": 6}
    bsm_exp_dict={'No Basement':1,'No' :2, "Mn": 3, "Av": 4, "Gd": 5}
    pool_dict = {'No Pool': 1, "Po": 2, "Fa": 3, "TA": 4, "Gd": 5, "Ex": 6}
    Fire_dict = {'No Fire place': 1, "Po": 2, "Fa": 3, "TA": 4, "Gd": 5, "Ex": 6}
    Garage_dict = {'No Garage': 1, "Po":2, "Fa": 3, "TA": 4, "Gd": 5, "Ex": 6}
    bsmt_fin_dict = {'No Basement': 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
    land_contour_dict = {'Low':0,'HLS':1,'Bnk':2,'Lvl':3 }
    functional_dict = {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7}
    garage_finish_dict = {'No Garage': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
    fence_dict = {'No Fence': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
    central_air_dict = {'N': 0, 'Y': 1}
    paved_drive_dict = {'N': 0, 'P': 1, 'Y': 2}
    street_dict = {'Grvl': 0, 'Pave': 1}
    alley_dict = {'No Alley': 0, 'Grvl': 1, 'Pave': 2}
    land_slope_dict = {'Gtl': 0, 'Mod': 1, 'Sev': 2}
    lot_shape_dict = {'Reg': 0, 'IR1': 1, 'IR2': 2, 'IR3': 3}
    
    # Perform mapping
    data["BsmtQual"] = data["BsmtQual"].map(bsm_dict)
    data["BsmtCond"] = data["BsmtCond"].map(bsm_dict)
    data["ExterQual"] = data["ExterQual"].map(qual_dict)
    data["ExterCond"] = data["ExterCond"].map(qual_dict)
    data["HeatingQC"] = data["HeatingQC"].map(qual_dict)
    data["KitchenQual"] = data["KitchenQual"].map(qual_dict)
    data["BsmtExposure"] = data["BsmtExposure"].map(bsm_exp_dict)
    data["PoolQC"] = data["PoolQC"].map(pool_dict)
    data["FireplaceQu"] = data["FireplaceQu"].map(Fire_dict)
    data["GarageQual"] = data["GarageQual"].map(Garage_dict)
    data["GarageCond"] = data["GarageCond"].map(Garage_dict)
    data["BsmtFinType1"] = data["BsmtFinType1"].map(bsmt_fin_dict)
    data["BsmtFinType2"] = data["BsmtFinType2"].map(bsmt_fin_dict)
    data["LandContour"] = data["LandContour"].map(land_contour_dict)
    data["Functional"] = data["Functional"].map(functional_dict)
    data["GarageFinish"] = data["GarageFinish"].map(garage_finish_dict)
    data["Fence"] = data["Fence"].map(fence_dict)
    data["CentralAir"] = data["CentralAir"].map(central_air_dict)
    data["PavedDrive"] = data["PavedDrive"].map(paved_drive_dict)
    data["Street"] = data["Street"].map(street_dict)
    data["Alley"] = data["Alley"].map(alley_dict)
    data["LandSlope"] = data["LandSlope"].map(land_slope_dict)
    data["LotShape"] = data["LotShape"].map(lot_shape_dict)

    remaining_categorical = data.select_dtypes(include=["object"]).columns.tolist()
    
    if len(remaining_categorical) == 0:
        print("Categorical feature mapping is done.")
    else:
        print("Error: Remaining categorical features:", remaining_categorical)



train4=train3.copy()
map_categorical_features(train4)


Categorical feature mapping is done.


## Target 

In [399]:
# Transform the target variable
target = train['SalePrice']
log_target = np.log(target)

# Eliminate outliers from log_target
log_target = log_target[~c1]
log_target = log_target[~c2]
log_target.reset_index(drop=True, inplace=True)

## Test Set

In [400]:

# Get missing values and calculate percentages for test dataset
missing = test.isna().sum()[test.isna().sum() > 0].sort_values(ascending=False)
df_missing = pd.DataFrame(missing, columns=['Missing'])
df_missing['Percent'] = df_missing['Missing'] * 100 / test.shape[0]

# Display scrollable table for missing values in test dataset
table_id = 'missing-table'
title = 'Missing Values (Test Dataset)'
html = create_scrollable_table(df_missing, table_id, title)
display(HTML(html))

Unnamed: 0,Missing,Percent
PoolQC,1456,99.79438
MiscFeature,1408,96.504455
Alley,1352,92.66621
Fence,1169,80.123372
FireplaceQu,730,50.03427
LotFrontage,227,15.558602
GarageCond,78,5.346127
GarageYrBlt,78,5.346127
GarageQual,78,5.346127
GarageFinish,78,5.346127


In [401]:
test1=test.copy()
test1.drop(['Id','Utilities'],axis=1,inplace=True)
handle_missing_values(test1)
check_missing_values(test1)

Missing values in the dataset:
MSZoning        4
Exterior1st     1
Exterior2nd     1
BsmtFinSF1      1
BsmtFinSF2      1
BsmtUnfSF       1
TotalBsmtSF     1
BsmtFullBath    2
BsmtHalfBath    2
KitchenQual     1
Functional      2
GarageCars      1
GarageArea      1
SaleType        1
dtype: int64


In [402]:
cond1 = test1['BsmtFinType1'] == 'No Basement'
test1['BsmtFinSF1'] = test1['BsmtFinSF1'].fillna(cond1.map({True: 0, False: test1['BsmtFinSF1'].mean()}))

cond2 = test1['BsmtFinType2'] == 'No Basement'
test1['BsmtFinSF2'] = test1['BsmtFinSF2'].fillna(cond2.map({True: 0, False: test1['BsmtFinSF2'].mean()}))

cond3 = test1['BsmtCond'] == 'No Basement'
test1['BsmtUnfSF'].fillna(cond3.map({True: 0, False: test1['BsmtUnfSF'].mean()}), inplace=True)

for col in ['BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
    test1[col].fillna(0, inplace=True)

test1['TotalBsmtSF'].fillna(test1['BsmtFinSF1'] + test1['BsmtFinSF2'] + test1['BsmtUnfSF'], inplace=True)

cat_new = ['MSZoning', 'Functional', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType']
for col in cat_new:
    test1[col].fillna(test1[col].mode()[0], inplace=True)

check_missing_values(test1)

No missing values in the dataset.


## feat eng for test

In [403]:
test2=test1.copy()
perform_feature_engineering(test2)

Feature engineering task is done




Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,TotalSF,PropertyAge,garageage,remodage
0,20,RH,80.0,11622,Pave,No Alley,Reg,Lvl,Inside,Gtl,...,MnPrv,No more Feat,0,6,WD,Normal,1778.0,49,49.0,49
1,20,RL,81.0,14267,Pave,No Alley,IR1,Lvl,Corner,Gtl,...,No Fence,Gar2,12500,6,WD,Normal,2658.0,52,52.0,52
2,60,RL,74.0,13830,Pave,No Alley,IR1,Lvl,Inside,Gtl,...,MnPrv,No more Feat,0,3,WD,Normal,2557.0,13,13.0,12
3,60,RL,78.0,9978,Pave,No Alley,IR1,Lvl,Inside,Gtl,...,No Fence,No more Feat,0,6,WD,Normal,2530.0,12,12.0,12
4,120,RL,43.0,5005,Pave,No Alley,IR1,HLS,Inside,Gtl,...,No Fence,No more Feat,0,1,WD,Normal,2560.0,18,18.0,18


In [404]:
test2[skewed_columns] = np.log1p(test2[skewed_columns])

In [405]:
# Transform categorical features in test2 using the fitted encoder
new_ht = ohe.transform(test2[nominal])

# Create a DataFrame with the encoded features
dfOneHot = pd.DataFrame(new_ht, columns=ohe.get_feature_names_out(nominal))

# Concatenate the encoded features with test2
test3 = pd.concat([test2, dfOneHot], axis=1)

# Drop the original categorical features from test3
test3 = test3.drop(nominal, axis=1)

In [406]:
test4=test3.copy()
map_categorical_features(test4)

Categorical feature mapping is done.


## feat scaling 

In [407]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Perform feature scaling on the training data
X_train_scaled = scaler.fit_transform(train4[nume_feat])

# Perform feature scaling on the test data
X_test_scaled = scaler.transform(test4[nume_feat])

# Update the scaled features in the train and test datasets
train4[nume_feat] = X_train_scaled
test4[nume_feat] = X_test_scaled


## base model

In [408]:
# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor()

# Fit the model to the training data
gbr.fit(train4, log_target)

# Create a K-fold cross-validation object
kfold = KFold(n_splits=5, shuffle=True)

# Perform cross-validation on the Gradient Boosting Regressor
results = cross_val_score(gbr, train4, log_target, scoring='neg_mean_squared_error', cv=kfold)

# Calculate the RMSE
rmse = np.sqrt(-results.mean())
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.12568010823612827


## bagging 

In [409]:

# Install pycaret if needed
!pip install --pre pycaret
from pycaret.regression import compare_models, setup

# Setup the data and target variable
data = pd.concat([train4, log_target], axis=1)
target = 'SalePrice'
setup(data=data, target=target)

# Compare regression models
compare_models()


[0m

Unnamed: 0,Description,Value
0,Session id,5377
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1458, 209)"
4,Transformed data shape,"(1458, 209)"
5,Transformed train set shape,"(1020, 209)"
6,Transformed test set shape,"(438, 209)"
7,Numeric features,208
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,0.0837,0.0139,0.1176,0.9159,0.0092,0.007,0.801
catboost,CatBoost Regressor,0.0819,0.014,0.1179,0.9149,0.0092,0.0069,4.262
ridge,Ridge Regression,0.085,0.0144,0.1196,0.9125,0.0093,0.0071,0.497
gbr,Gradient Boosting Regressor,0.0883,0.0158,0.1252,0.9048,0.0098,0.0074,1.208
lightgbm,Light Gradient Boosting Machine,0.0912,0.0165,0.128,0.9009,0.01,0.0076,1.387
omp,Orthogonal Matching Pursuit,0.0958,0.0179,0.1331,0.8928,0.0104,0.008,0.452
et,Extra Trees Regressor,0.0954,0.0191,0.1371,0.8841,0.0107,0.008,2.565
rf,Random Forest Regressor,0.0978,0.0197,0.1396,0.8822,0.0109,0.0082,2.539
xgboost,Extreme Gradient Boosting,0.0979,0.0202,0.1415,0.8774,0.011,0.0082,1.778
ada,AdaBoost Regressor,0.1306,0.0279,0.1667,0.8313,0.0129,0.0109,0.869


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [411]:

# Initialize the top 5 regression models
gbr = GradientBoostingRegressor()
br = BayesianRidge()
lgbm = LGBMRegressor()
cb = CatBoostRegressor(verbose=0)
omp = OrthogonalMatchingPursuit()
ridge = Ridge()

# Initialize the list to store results
results = []

# Loop through each model and calculate RMSE
for model in [cb, br, gbr, lgbm, ridge]:
    model.fit(train4, log_target)
    result = cross_val_score(model, train4, log_target, scoring='neg_mean_squared_error', cv=5)
    rmse = np.sqrt(np.mean(-result))
    results.append(rmse)

# Calculate the mean RMSE
mean_rmse = np.mean(results)

mean_rmse


0.12033585530031125

In [428]:
train_features = train4.columns.tolist()
test4_reordered = test4[train_features]
# Calculate the final predictions using a weighted average of individual model predictions
final_pred = ( 
    0.4* np.exp(br.predict( test4_reordered))+
    0.2* np.exp(ridge.predict(test4_reordered)) +
    0.3* np.exp(cb.predict( test4_reordered))+
    0.1* np.exp(gbr.predict( test4_reordered))
    #0.05* np.exp(lgbm.predict( test4_reordered))
)

# Print the first 25 predictions
final_pred[:25]

array([118772.94306852, 160888.93634984, 183357.5919767 , 195637.29513996,
       196158.42042097, 169676.41536161, 174219.25727964, 159644.56063648,
       189926.00408785, 121514.6072523 , 193390.06482499,  97193.52219799,
        95872.12250594, 147699.68716421, 111824.30543529, 357793.05104355,
       251022.94693199, 288245.00400907, 286544.12573075, 470925.96748735,
       330890.46773459, 213568.37814993, 177318.61579157, 166279.99642895,
       188487.88573846])

In [429]:
# Create a DataFrame for submission
submission = pd.DataFrame()
submission['Id'] = test['Id']
submission['SalePrice'] = final_pred

# Save the submission DataFrame as a CSV file
submission.to_csv('./submission.csv', index=False)