In [1]:
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go
import plotly.express as px
import scipy.stats as stats
from IPython.display import display, HTML


In [2]:
df= pd.read_csv("C:/Users/anura/OneDrive/Desktop/ME/DataProjects/HousePricePrediction/house-prices-advanced-regression-techniques/train.csv")

In [3]:
def scrollable_table(df,table_id,title):
    html=f'<h3>{title}</h3>'
    html+=f'<div id="{table_id}" style="height:200px;overflow:auto;">'
    html+=df.to_html()
    html+='</div>'
    return html

In [4]:
df.shape

(1460, 81)

In [5]:
numerical_features=df.select_dtypes(include=[np.number])
summmary_stats=numerical_features.describe().T
html_numerical=scrollable_table(summmary_stats,'numerical_features','summmary statistic of numeric features')
display(HTML(html_numerical))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [6]:
categorical_features=df.select_dtypes(include=[object])
cat_summmary_stats=categorical_features.describe().T
html_numerical=scrollable_table(cat_summmary_stats,'categorical_features','summmary statistic of categorical features')
display(HTML(html_numerical))

Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


In [7]:
null_values=df.isnull().sum()
html_null=scrollable_table(null_values.to_frame(),'null_values','Null values in the dfset')
display(HTML(html_null))

Unnamed: 0,0
Id,0
MSSubClass,0
MSZoning,0
LotFrontage,259
LotArea,0
Street,0
Alley,1369
LotShape,0
LandContour,0
Utilities,0


In [8]:
import scipy.stats as stats

# Fit a normal distribution to the SalePrice data
mu, sigma = stats.norm.fit(df['SalePrice'])

# Create a histogram of the SalePrice column
hist_data = go.Histogram(x=df['SalePrice'], nbinsx=50, name="Histogram", opacity=1, histnorm='probability density', marker=dict(color='purple'))

# Calculate the normal distribution based on the fitted parameters
x_norm = np.linspace(df['SalePrice'].min(), df['SalePrice'].max(), 100)
y_norm = stats.norm.pdf(x_norm, mu, sigma)

# Create the normal distribution overlay
norm_data = go.Scatter(x=x_norm, y=y_norm, mode="lines", name=f"Normal dist. (μ={mu:.2f}, σ={sigma:.2f})", line=dict(color="green"))

# Combine the histogram and the overlay
fig = go.Figure(data=[hist_data, norm_data])

# Set the layout for the plot
fig.update_layout(
    title="SalePrice Distribution",
    xaxis_title="SalePrice",
    yaxis_title="Density",
    legend_title_text="Fitted Normal Distribution",
    plot_bgcolor='rgba(32, 32, 32, 1)',
    paper_bgcolor='rgba(32, 32, 32, 1)',
    font=dict(color='white')
)

# Create a Q-Q plot
qq_data = stats.probplot(df['SalePrice'], dist="norm")
qq_fig = px.scatter(x=qq_data[0][0], y=qq_data[0][1], labels={'x': 'Theoretical Quantiles', 'y': 'Ordered Values'}, color_discrete_sequence=["purple"])
qq_fig.update_layout(
    title="Q-Q plot",
    plot_bgcolor='rgba(32, 32, 32, 1)',
    paper_bgcolor='rgba(32, 32, 32, 1)',
    font=dict(color='white')
)

# Calculate the line of best fit
slope, intercept, r_value, p_value, std_err = stats.linregress(qq_data[0][0], qq_data[0][1])
line_x = np.array(qq_data[0][0])
line_y = intercept + slope * line_x

# Add the line of best fit to the Q-Q plot
line_data = go.Scatter(x=line_x, y=line_y, mode="lines", name="Normal Line", line=dict(color="green"))

# Update the Q-Q plot with the normal line
qq_fig.add_trace(line_data)

# Show the plots
fig.show()
qq_fig.show()

In [9]:
#import plotly.graph_objects as go

# Count the number of each dwelling type
dwelling_counts = df['BldgType'].value_counts()
dwelling_prices=df.groupby('BldgType')['SalePrice'].mean()

# Format labels for the second graph
formatted_dwelling_prices = ['$' + f'{value:,.2f}' for value in dwelling_prices.values]

# Create a bar plot for dwelling type counts
bar_fig = go.Figure(go.Bar(
    x=dwelling_counts.index,
    y=dwelling_counts.values,
    marker_color='purple'
))
bar_fig.update_layout(
    title="Distribution of Dwelling Types",
    xaxis_title="Dwelling Type",
    yaxis_title="Count",
    plot_bgcolor='rgba(32, 32, 32, 1)',
    paper_bgcolor='rgba(32, 32, 32, 1)',
    font=dict(color='white')
)

# Create a box plot to compare sale prices across different dwelling types
fig2 = go.Figure(data=[go.Bar(
    x=dwelling_prices.index,
    y=dwelling_prices.values,
    marker_color='rgb(156, 39, 176)',
    text=formatted_dwelling_prices,
    textposition='outside',
    width=0.4,
    marker=dict(line=dict(width=2, color='rgba(0,0,0,1)'), opacity=1)
)])
fig2.update_layout(
    title='Average Sale Price by Building Type',
    xaxis_title='Building Type',
    yaxis_title='Price',
    plot_bgcolor='rgba(34, 34, 34, 1)',
    paper_bgcolor='rgba(34, 34, 34, 1)',
    font=dict(color='white')
)

# Show the figures
bar_fig.show()
fig2.show()


1fam building type go for higher prices
The Extra increase in the price of twnhsE building type compared to twnhs shows the End units have higher demand.

In [10]:
import pandas as pd
import plotly.express as px

# Assuming df is your DataFrame containing the dataset

# Group the data by zoning classifications and calculate the average sale price
street_prices = df.groupby('MSZoning')['SalePrice'].mean().reset_index()

# Plot a bar chart
fig = px.bar(street_prices, x='MSZoning', y='SalePrice', color='MSZoning',
             labels={'SalePrice': 'Average Sale Price', 'MSZoning': 'Zoning Classification'},
             title='Average Sale Price by Zoning Classification')
fig.update_layout(
    xaxis_title='Zoning Classification',
    yaxis_title='Average Sale Price',
    plot_bgcolor='rgba(255, 255, 255, 0.9)',  # Change plot background color
    font=dict(color='black')  # Change font color
)

# Add labels of prices above each bar
for i in range(len(street_prices)):
    fig.add_annotation(
        x=street_prices['MSZoning'][i],
        y=street_prices['SalePrice'][i],
        text=f"${street_prices['SalePrice'][i]:,.2f}",
        showarrow=False,
        font=dict(color='black', size=10),
        yshift=5  # Adjust vertical position of the label
    )

fig.show()


FV-Floating Visual Residential and RL-Residential Low density are of highest value

In [11]:
import pandas as pd
import plotly.express as px

# Assuming df is your DataFrame containing the dataset

# Group the data by street access types and calculate the average sale price
avg_sale_price_by_street = df.groupby('Street')['SalePrice'].mean().reset_index()

# Plot a bar chart for street access
fig_street = px.bar(avg_sale_price_by_street, x='Street', y='SalePrice', color='Street',
                    labels={'SalePrice': 'Average Sale Price', 'Street': 'Street Access Type'},
                    title='Average Sale Price by Street Access Type')
fig_street.update_layout(
    xaxis_title='Street Access Type',
    yaxis_title='Average Sale Price',
    plot_bgcolor='rgba(255, 255, 255, 0.9)',  # Change plot background color
    font=dict(color='black')  # Change font color
)

# Add labels of prices above each bar for street access
for i in range(len(avg_sale_price_by_street)):
    fig_street.add_annotation(
        x=avg_sale_price_by_street['Street'][i],
        y=avg_sale_price_by_street['SalePrice'][i],
        text=f"${avg_sale_price_by_street['SalePrice'][i]:,.2f}",
        showarrow=False,
        font=dict(color='black', size=10),
        yshift=5  # Adjust vertical position of the label
    )

# Group the data by alley access types and calculate the average sale price
avg_sale_price_by_alley = df.groupby('Alley')['SalePrice'].mean().reset_index()

# Plot a bar chart for alley access
fig_alley = px.bar(avg_sale_price_by_alley, x='Alley', y='SalePrice', color='Alley',
                   labels={'SalePrice': 'Average Sale Price', 'Alley': 'Alley Access Type'},
                   title='Average Sale Price by Alley Access Type')
fig_alley.update_layout(
    xaxis_title='Alley Access Type',
    yaxis_title='Average Sale Price',
    plot_bgcolor='rgba(255, 255, 255, 0.9)',  # Change plot background color
    font=dict(color='black')  # Change font color
)

# Add labels of prices above each bar for alley access
for i in range(len(avg_sale_price_by_alley)):
    fig_alley.add_annotation(
        x=avg_sale_price_by_alley['Alley'][i],
        y=avg_sale_price_by_alley['SalePrice'][i],
        text=f"${avg_sale_price_by_alley['SalePrice'][i]:,.2f}",
        showarrow=False,
        font=dict(color='black', size=10),
        yshift=5  # Adjust vertical position of the label
    )

fig_street.show()
fig_alley.show()



Properties that have access to paved road have higher value 

In [12]:
import pandas as pd
import plotly.express as px

# Assuming df is your DataFrame containing the dataset

# Group the data by property shapes and calculate the average sale price
avg_sale_price_by_shape = df.groupby('LotShape')['SalePrice'].mean().reset_index()

# Plot a bar chart
fig = px.bar(avg_sale_price_by_shape, x='LotShape', y='SalePrice',
             labels={'SalePrice': 'Average Sale Price', 'LotShape': 'Property Shape'},
             title='Average Sale Price by Property Shape')
fig.update_layout(
    xaxis_title='Property Shape',
    yaxis_title='Average Sale Price',
    plot_bgcolor='rgba(255, 255, 255, 0.9)',  # Change plot background color
    font=dict(color='black')  # Change font color
)

# Add labels of prices above each bar
for i in range(len(avg_sale_price_by_shape)):
    fig.add_annotation(
        x=avg_sale_price_by_shape['LotShape'][i],
        y=avg_sale_price_by_shape['SalePrice'][i],
        text=f"${avg_sale_price_by_shape['SalePrice'][i]:,.2f}",
        showarrow=False,
        font=dict(color='black', size=10),
        yshift=5  # Adjust vertical position of the label
    )

fig.show()


IR2 and IR3 ie moderately irregular and irregular property have higher average price respextively.

In [13]:
import pandas as pd
import plotly.express as px

# Assuming df is your DataFrame containing the dataset

# Group the data by property contours and calculate the average sale price
avg_sale_price_by_contour = df.groupby('LandContour')['SalePrice'].mean().reset_index()

# Plot a bar chart
fig = px.bar(avg_sale_price_by_contour, x='LandContour', y='SalePrice',
             labels={'SalePrice': 'Average Sale Price', 'LandContour': 'Property Contour'},
             title='Average Sale Price by Property Contour')
fig.update_layout(
    xaxis_title='Property Contour',
    yaxis_title='Average Sale Price',
    plot_bgcolor='rgba(255, 255, 255, 0.9)',  # Change plot background color
    font=dict(color='black')  # Change font color
)

# Add labels of prices above each bar
for i in range(len(avg_sale_price_by_contour)):
    fig.add_annotation(
        x=avg_sale_price_by_contour['LandContour'][i],
        y=avg_sale_price_by_contour['SalePrice'][i],
        text=f"${avg_sale_price_by_contour['SalePrice'][i]:,.2f}",
        showarrow=False,
        font=dict(color='black', size=10),
        yshift=5  # Adjust vertical position of the label
    )

fig.show()


HLS- Hillside property have higher value, coz of the scenic view, elvation supports many factors such as away from insects and other worms. Its safer too

In [14]:
import pandas as pd
import plotly.express as px

# Assuming df is your DataFrame containing the dataset

# Calculate the age of the properties
df['PropertyAge'] = df['YrSold'] - df['YearBuilt']

# Create a scatter plot
fig = px.scatter(df, x='PropertyAge', y='SalePrice', trendline="ols",
                 labels={'SalePrice': 'Sale Price', 'PropertyAge': 'Property Age'},
                 title='Effect of Property Age on Sale Price',color='PropertyAge',color_continuous_scale=px.colors.sequential.Purp)


fig.update_layout(plot_bgcolor='rgb(30,30,30)', paper_bgcolor='rgb(30,30,30)', font=dict(color='white'))
fig.show()


In [15]:
yearly_avg_sale_price = df.groupby('YrSold')['SalePrice'].mean()

fig13 = px.box(df, x='YrSold', y='SalePrice', title='Sale Price Trends Over the Years',
               points=False, color_discrete_sequence=['green'])

fig13.add_trace(px.line(x=yearly_avg_sale_price.index, y=yearly_avg_sale_price.values).data[0])

fig13.update_traces(line=dict(color='purple', width=4), selector=dict(type='scatter', mode='lines'))

for year, avg_price in yearly_avg_sale_price.items():
    fig13.add_annotation(
        x=year,
        y=avg_price,
        text=f"{avg_price:,.0f}",
        font=dict(color='white'),
        showarrow=False,
        bgcolor='rgba(128, 0, 128, 0.6)'
    )

fig13.update_layout(
    plot_bgcolor='rgb(30,30,30)',
    paper_bgcolor='rgb(30,30,30)',
    font=dict(color='white'),
    xaxis_title='Year Sold',
    yaxis_title='Sale Price'
)

fig13.show()


In [16]:
import pandas as pd
import plotly.express as px

# Assuming df is your DataFrame containing the dataset

# Calculate the growth of sale prices over the years
df['YearSold'] = pd.to_datetime(df['YrSold'], format='%Y')
df['SalePriceGrowth'] = df.groupby('YearSold')['SalePrice'].transform('mean')

# Create a box plot
fig = px.box(df, x='YearSold', y='SalePriceGrowth',
             labels={'SalePriceGrowth': 'Average Sale Price', 'YearSold': 'Year Sold'},
             title='Growth of Sale Price Over Years',
             template='plotly_dark')

# Customize the plot
fig.update_layout(
    xaxis_title='Year Sold',
    yaxis_title='Average Sale Price',
    plot_bgcolor='rgba(0, 0, 0, 0)',  # Change plot background color to transparent
    font=dict(color='white'),  # Change font color to white
    showlegend=False  # Hide legend
)
fig.show()


We Observe that in the years 2006 and 2007 there was a steady increase in the price of lands. But there's a sudden decline in the price in 2008. This is due to the the 2008 financial crisis.The crisis caused property prices to drop, delayed projects, increased regulation, and changed buyer behavior which affected the price of the land but in the further years prices pick up again.

In [17]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Assuming df is your DataFrame containing the dataset

# Numerical attributes correlation with SalePrice
numerical_corr = df.select_dtypes(include=['int64', 'float64']).corr()['SalePrice'].sort_values(ascending=False)

# Categorical attributes correlation with SalePrice
categorical_corr = {}

for column in df.select_dtypes(include=['object']):
    if df[column].nunique() == 2:  # Binary variable
        encoded_column = pd.get_dummies(df[column], drop_first=True)
        for category in encoded_column.columns:
            correlation = np.corrcoef(encoded_column[category], df['SalePrice'])[0, 1]
            categorical_corr[f'{column}_{category}'] = correlation
    else:
        # Use ANOVA for other categorical variables
        encoded_column = pd.get_dummies(df[column], drop_first=True)
        for category in encoded_column.columns:
            f_statistic, p_value = stats.f_oneway(df[df[column] == category]['SalePrice'], 
                                                   df[df[column] != category]['SalePrice'])
            correlation = f_statistic
            categorical_corr[f'{column}_{category}'] = correlation

# Plotting heatmap for numerical attributes correlation
fig_numerical = go.Figure(data=go.Heatmap(
    z=df[numerical_corr.index].corr().values,
    x=numerical_corr.index,
    y=numerical_corr.index,
    colorscale='Viridis',
    reversescale=True
))
fig_numerical.update_layout(
    title='Correlation Heatmap of Numerical Attributes with SalePrice',
    xaxis_title='Numerical Attributes',
    yaxis_title='Numerical Attributes',
    template='plotly_dark'
)

# Plotting bar chart for categorical attributes correlation
fig_categorical = go.Figure(data=go.Bar(
    x=list(categorical_corr.keys()),
    y=list(categorical_corr.values()),
    marker_color='indigo'
))
fig_categorical.update_layout(
    title='Correlation of Categorical Attributes with SalePrice',
    xaxis_title='Categorical Attributes',
    yaxis_title='Correlation Coefficient',
    template='plotly_dark'
)

fig_numerical.show()
fig_categorical.show()



In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector


#creating pipeline to execute a series of actions at once
# Define numerical and categorical transformers
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, selector(dtype_exclude=['object', 'datetime64[ns]'])),
        ('cat', categorical_transformer, selector(dtype_include=['object']))
    ])

# Define the full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Split the dataset into features and target variable
X = df.drop('SalePrice', axis=1)
y = np.log(df['SalePrice'])  # Normalize dependent variable

# Apply the pipeline to the dataset
X_preprocessed = pipeline.fit_transform(X)



In [19]:
from sklearn.model_selection import train_test_split, GridSearchCV,KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Split the preprocessed data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Define a list of models to evaluate
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Define parameter grids for each model
param_grid = {

    'Linear Regression': {},
    'Decision Tree': {'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]},
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10],
    }
}

# 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Train and tune models
best_models = {}
for name, model in models.items():
    model_file = f"DumpFiles/{name}_model.pkl"
    try:
        # Try to load the model from file
        best_models[name] = joblib.load(model_file)
        print(f"Loaded {name} model from {model_file}")
    except FileNotFoundError:
        print(f"Training and tuning {name}...")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid[name], scoring='neg_mean_squared_error', cv=cv)
        grid_search.fit(X_train, y_train)
        best_models[name] = grid_search.best_estimator_
        joblib.dump(best_models[name], model_file)
        print(f"Saved {name} model to {model_file}")
        print(f"Best parameters for {name}: {grid_search.best_params_}")

# Evaluate models
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred,squared=False)
    print(f"{name} RMSE: {rmse}")

# Identify the best model
best_model_name = min(best_models, key=lambda x: mean_squared_error(y_test, best_models[x].predict(X_test)))
print(f"\nBest model: {best_model_name}")


Loaded Linear Regression model from Linear Regression_model.pkl
Loaded Decision Tree model from Decision Tree_model.pkl
Loaded Random Forest model from Random Forest_model.pkl
Loaded XGBoost model from XGBoost_model.pkl
Linear Regression RMSE: 977885460.1489394
Decision Tree RMSE: 0.18968072522689552
Random Forest RMSE: 0.14657478518065975
XGBoost RMSE: 0.13502976067795205

Best model: XGBoost


In [20]:
from sklearn.neural_network import MLPRegressor

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Create an MLPRegressor instance
mlp = MLPRegressor(random_state=42,max_iter=10000, n_iter_no_change = 3,learning_rate_init=0.001)

# Define the parameter grid for tuning
param_grid = {
    'hidden_layer_sizes': [(10,), (10,10), (10,10,10), (25)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

# Create the GridSearchCV object
grid_search_mlp = GridSearchCV(mlp, param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=1)

# Fit the model on the training data
grid_search_mlp.fit(X_train_scaled, y_train)

# Print the best parameters found during the search
print("Best parameters found: ", grid_search_mlp.best_params_)

# Evaluate the model on the test data
best_score = np.sqrt(-1 * grid_search_mlp.best_score_)
print("Test score: ", best_score)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters found:  {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'solver': 'adam'}
Test score:  0.24241414899707112


In [21]:
#pca
from sklearn.decomposition import PCA

pca = PCA()
X_pca_pre = pca.fit_transform(X_preprocessed)

# Calculate the cumulative explained variance
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)

# Choose the number of components based on the explained variance threshold
n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1

pca = PCA(n_components=n_components)
pipeline_pca = Pipeline(steps=
                        [('preprocessor', preprocessor),
                        ('pca', pca)])

X_pca = pipeline_pca.fit_transform(X)

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
import joblib

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Define the models
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Define the hyperparameter grids for each model
param_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 5, 10],
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10],
    }
}

# 3-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Train and tune the models
grids_pca = {}
for model_name, model in models.items():
    model_file = f"DumpFiles/{model_name}_model2.pkl"
    try:
        # Try to load the model from file
        grids_pca[model_name] = joblib.load(model_file)
        print(f"Loaded {model_name} model from {model_file}")
    except FileNotFoundError:
        print(f"Training {model_name} model...")
        grids_pca[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
        grids_pca[model_name].fit(X_train_pca, y_train_pca)
        joblib.dump(grids_pca[model_name], model_file)
        print(f"Saved {model_name} model to {model_file}")
    
    # Optionally print best parameters and score
    best_params = grids_pca[model_name].best_params_
    best_score = np.sqrt(-1 * grids_pca[model_name].best_score_)
    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best RMSE for {model_name}: {best_score}\n')



Loaded LinearRegression model from LinearRegression_model2.pkl
Best parameters for LinearRegression: {}
Best RMSE for LinearRegression: 0.15144235216085575

Loaded RandomForest model from RandomForest_model2.pkl
Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
Best RMSE for RandomForest: 0.15020849517699

Loaded XGBoost model from XGBoost_model2.pkl
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Best RMSE for XGBoost: 0.14151049982039104



In [23]:
from sklearn.neural_network import MLPRegressor

X_train_scaled_pca = X_train_pca.copy()
X_test_scaled_pca = X_test_pca.copy()

# Create an MLPRegressor instance
mlp = MLPRegressor(random_state=42,max_iter=10000, n_iter_no_change = 3,learning_rate_init=0.001)

# Define the parameter grid for tuning
param_grid = {
    'hidden_layer_sizes': [(10,), (10,10), (10,10,10), (25)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01, .1, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

# Create the GridSearchCV object
grid_search_mlp_pca = GridSearchCV(mlp, param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=1)

# Fit the model on the training data
grid_search_mlp_pca.fit(X_train_scaled_pca, y_train)

# Print the best parameters found during the search
print("Best parameters found: ", grid_search_mlp_pca.best_params_)

# Evaluate the model on the test data
best_score = np.sqrt(-1 * grid_search_mlp_pca.best_score_)
print("Test score: ", best_score)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
Best parameters found:  {'activation': 'tanh', 'alpha': 1, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'solver': 'adam'}
Test score:  0.21584523847761983


In [24]:
from sklearn.metrics import mean_squared_error
for i in best_models.keys():
    print (i + ': ' + str(np.sqrt(mean_squared_error(best_models[i].predict(X_test), y_test))))

Linear Regression: 977885460.1489394
Decision Tree: 0.18968072522689552
Random Forest: 0.14657478518065975
XGBoost: 0.13502976067795205


In [25]:
from sklearn.metrics import mean_squared_error
for i in grids_pca.keys():
    print (i + ': ' + str(np.sqrt(mean_squared_error(grids_pca[i].predict(X_test_pca), y_test))))

LinearRegression: 0.14847144040978347
RandomForest: 0.1507648946197434
XGBoost: 0.14378253792655776


In [26]:
print( str(np.sqrt(mean_squared_error(grid_search_mlp.predict(X_test_scaled),y_test))))

0.178330928202835


In [27]:
print( str(np.sqrt(mean_squared_error(grid_search_mlp_pca.predict(X_test_scaled_pca),y_test))))

0.18115041346875693


In [28]:
var_explore = df[['Fence','Alley','MiscFeature','PoolQC','FireplaceQu','GarageCond','GarageQual','GarageFinish','GarageType','BsmtExposure','BsmtFinType2','BsmtFinType1','BsmtCond','BsmtQual','MasVnrType','Electrical','MSZoning','Utilities','Exterior1st','Exterior2nd','KitchenQual','Functional','SaleType','LotFrontage','GarageYrBlt','MasVnrArea','BsmtFullBath','BsmtHalfBath','GarageCars','GarageArea','TotalBsmtSF']]

display(HTML(scrollable_table(var_explore, 'var_explore', 'List of Variables to Explore for Feature Engineering')))

Unnamed: 0,Fence,Alley,MiscFeature,PoolQC,FireplaceQu,GarageCond,GarageQual,GarageFinish,GarageType,BsmtExposure,BsmtFinType2,BsmtFinType1,BsmtCond,BsmtQual,MasVnrType,Electrical,MSZoning,Utilities,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,LotFrontage,GarageYrBlt,MasVnrArea,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,TotalBsmtSF
0,,,,,,TA,TA,RFn,Attchd,No,Unf,GLQ,TA,Gd,BrkFace,SBrkr,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD,65.0,2003.0,196.0,1,0,2,548,856
1,,,,,TA,TA,TA,RFn,Attchd,Gd,Unf,ALQ,TA,Gd,,SBrkr,RL,AllPub,MetalSd,MetalSd,TA,Typ,WD,80.0,1976.0,0.0,0,1,2,460,1262
2,,,,,TA,TA,TA,RFn,Attchd,Mn,Unf,GLQ,TA,Gd,BrkFace,SBrkr,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD,68.0,2001.0,162.0,1,0,2,608,920
3,,,,,Gd,TA,TA,Unf,Detchd,No,Unf,ALQ,Gd,TA,,SBrkr,RL,AllPub,Wd Sdng,Wd Shng,Gd,Typ,WD,60.0,1998.0,0.0,1,0,3,642,756
4,,,,,TA,TA,TA,RFn,Attchd,Av,Unf,GLQ,TA,Gd,BrkFace,SBrkr,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD,84.0,2000.0,350.0,1,0,3,836,1145
5,MnPrv,,Shed,,,TA,TA,Unf,Attchd,No,Unf,GLQ,TA,Gd,,SBrkr,RL,AllPub,VinylSd,VinylSd,TA,Typ,WD,85.0,1993.0,0.0,1,0,2,480,796
6,,,,,Gd,TA,TA,RFn,Attchd,Av,Unf,GLQ,TA,Ex,Stone,SBrkr,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD,75.0,2004.0,186.0,1,0,2,636,1686
7,,,Shed,,TA,TA,TA,RFn,Attchd,Mn,BLQ,ALQ,TA,Gd,Stone,SBrkr,RL,AllPub,HdBoard,HdBoard,TA,Typ,WD,,1973.0,240.0,1,0,2,484,1107
8,,,,,TA,TA,Fa,Unf,Detchd,No,Unf,Unf,TA,TA,,FuseF,RM,AllPub,BrkFace,Wd Shng,TA,Min1,WD,51.0,1931.0,0.0,0,0,2,468,952
9,,,,,TA,TA,Gd,RFn,Attchd,No,Unf,GLQ,TA,TA,,SBrkr,RL,AllPub,MetalSd,MetalSd,TA,Typ,WD,50.0,1939.0,0.0,1,0,1,205,991


In [29]:
from sklearn.preprocessing import FunctionTransformer

# feature engineering functions 
def custom_features(df):
    df_out = df.copy()
    df_out['PropertyAge'] = df_out['YrSold'] - df_out['YearBuilt']
    df_out['TotalSF'] = df_out['TotalBsmtSF'] + df_out['1stFlrSF'] + df_out['2ndFlrSF']
    df_out['TotalBath'] = df_out['FullBath'] + 0.5 * df_out['HalfBath'] + df_out['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
    df_out['HasRemodeled'] = (df_out['YearRemodAdd'] != df_out['YearBuilt']).astype(object)
    df_out['Has2ndFloor'] = (df_out['2ndFlrSF'] > 0).astype(object)
    df_out['HasGarage'] = (df_out['GarageArea'] > 0).astype(object)
    df_out['YrSold_cat'] = df_out['YrSold'].astype(object)
    df_out['MoSold_cat'] = df_out['MoSold'].astype(object)
    df_out['YearBuilt_cat'] = df_out['YearBuilt'].astype(object)
    df_out['MSSubClass_cat'] = df_out['MSSubClass'].astype(object)
    
    return df_out

feature_engineering_transformer = FunctionTransformer(custom_features)




In [55]:
# Identify categorical and numerical columns
new_cols_categorical = pd.Index(['HasRemodeled', 'Has2ndFloor', 'HasGarage'])
new_cols_numeric = pd.Index(['PropertyAge', 'TotalSF', 'TotalBath', 'YrSold_cat', 'MoSold_cat', 'YearBuilt_cat', 'MSSubClass_cat'])

# Update categorical and numerical columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.append(new_cols_categorical)
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.append(new_cols_numeric)

# Remove target variable from numerical columns
numerical_columns = numerical_columns.drop('SalePrice')
numerical_columns=numerical_columns.drop('SalePriceGrowth')
#creating pipeline to execute a series of actions at once
# Define numerical and categorical transformers
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns ),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Create a pipeline with the preprocessor
pipeline_fe = Pipeline(steps=[
    ('fe', feature_engineering_transformer),
    ('preprocessor', preprocessor),
    ('pca', pca)])

# Apply the pipeline to your dataset
X = df.drop('SalePrice', axis=1)
y = np.log(df['SalePrice'])
X_preprocessed_fe = pipeline_fe.fit_transform(X)


In [56]:
# Split the data into training and testing sets
import joblib
from sklearn.model_selection import train_test_split
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

# Define the models
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Define the hyperparameter grids for each model
param_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 5, 10],
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10],
    }
}

# 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=42)



# Train and tune the models
grids_fe = {}
for model_name, model in models.items():
    model_file = f"DumpFiles/{model_name}_fe_model.pkl"
    try:
        # Try to load the model from file
        grids_fe[model_name] = joblib.load(model_file)
        print(f"Loaded {model_name} model from {model_file}")
    except FileNotFoundError:
        print(f"Training and tuning {model_name}...")
        grids_fe[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
        grids_fe[model_name].fit(X_train_fe, y_train_fe)
        joblib.dump(grids_fe[model_name], model_file)
        print(f"Saved {model_name} model to {model_file}")
    
    # Optionally print best parameters and score
    best_params = grids_fe[model_name].best_params_
    best_score = np.sqrt(-1 * grids_fe[model_name].best_score_)
    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best RMSE for {model_name}: {best_score}\n')


Loaded LinearRegression model from LinearRegression_fe_model.pkl
Best parameters for LinearRegression: {}
Best RMSE for LinearRegression: 0.16689201032150458

Loaded RandomForest model from RandomForest_fe_model.pkl
Best parameters for RandomForest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}
Best RMSE for RandomForest: 0.14711665683882444

Loaded XGBoost model from XGBoost_fe_model.pkl
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best RMSE for XGBoost: 0.13940953657526153



In [57]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
import joblib

# Assuming you already have X_train_scaled_fe, X_test_scaled_fe, y_train_fe defined
# Create a copy of the feature sets
X_train_scaled_fe = X_train_fe.copy()
X_test_scaled_fe = X_test_fe.copy()
# Create an MLPRegressor instance
mlp = MLPRegressor(random_state=42, max_iter=10000, n_iter_no_change=3)

# Define the parameter grid for tuning
param_grid = {
    'hidden_layer_sizes': [(10,), (10, 10), (10, 25)],
    'activation': ['relu', 'tanh', 'sigmoid'],
    'solver': ['adam', 'sgd'],
    'alpha': [.1, .5, 1, 10, 100],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init' : [0.1]
}

# Create the GridSearchCV object
grid_search_mlp_fe = GridSearchCV(mlp, param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=1)

# Fit the model on the training data
grid_search_mlp_fe.fit(X_train_scaled_fe, y_train_fe)

# Save the trained GridSearchCV object
joblib.dump(grid_search_mlp_fe, 'DumpFiles/grid_search_mlp_fe.pkl')

# Print the best parameters found during the search
print("Best parameters found: ", grid_search_mlp_fe.best_params_)

# Evaluate the model on the test data
best_score = np.sqrt(-1 * grid_search_mlp_fe.best_score_)
print("Test score: ", best_score)

# Load the saved GridSearchCV object
loaded_grid_search_mlp_fe = joblib.load('DumpFiles/grid_search_mlp_fe.pkl')



Fitting 3 folds for each of 270 candidates, totalling 810 fits




324 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\anura\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\anura\anaconda3\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\anura\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 753, in fit
    return self._fit(X, y, incremental=False)
  File "c:\Users\anura\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line

Best parameters found:  {'activation': 'tanh', 'alpha': 0.5, 'hidden_layer_sizes': (10, 10), 'learning_rate': 'adaptive', 'learning_rate_init': 0.1, 'solver': 'sgd'}
Test score:  0.12786622291354935


In [58]:
from sklearn.metrics import mean_squared_error
for i in grids_fe.keys():
    print (i + ': ' + str(np.sqrt(mean_squared_error(grids_fe[i].predict(X_test_fe), y_test))))

LinearRegression: 0.19245547549346592
RandomForest: 0.16965767606335155
XGBoost: 0.17331061602137288


In [59]:
print( str(np.sqrt(mean_squared_error(grid_search_mlp_fe.predict(X_test_scaled_fe),y_test))))

0.1306771419031204


In [60]:
df_test = pd.read_csv("C:/Users/anura/OneDrive/Desktop/ME/DataProjects/HousePricePrediction/house-prices-advanced-regression-techniques/test.csv")

In [62]:
df_test_preprocessed = pipeline_fe.transform(df_test)

In [63]:
#xgboost submission

#here we will be taking exponential of prediction because we did logarithmic transformation while training as a normalisation step.
y_xgboost = np.exp(grids_fe['XGBoost'].predict(df_test_preprocessed))

df_xgboost_out = df_test[['Id']].copy()
df_xgboost_out['SalePrice'] = y_xgboost

#
df_xgboost_out.to_csv('Submissions/submission_xgboost_new_features_normalized.csv', index=False)

In [64]:
#randomforest submission
y_rf = np.exp(grids_fe['RandomForest'].predict(df_test_preprocessed))

df_rf_out = df_test[['Id']].copy()
df_rf_out['SalePrice'] = y_rf

#
df_rf_out.to_csv('submissions/submission_rf_new_features_normalized.csv', index=False)

In [65]:
#mlp submission
y_mlp = np.exp(grid_search_mlp_fe.predict(df_test_preprocessed))

df_mlp_out = df_test[['Id']].copy()
df_mlp_out['SalePrice'] = y_mlp

df_mlp_out.to_csv('Submissions/submission_mlp_normalized.csv', index=False)

In [67]:
#creating a submussion with ensemable model of rf,xgboost,and mlp
y_avg_ens = (y_rf + y_xgboost + y_mlp)/3

#xgboost submission
df_avg_ens_out = df_test[['Id']].copy()
df_avg_ens_out['SalePrice'] = y_avg_ens

#
df_avg_ens_out.to_csv('Submissions/submission_avg_ens_new_features_normalized.csv', index=False)

In [68]:
from sklearn.ensemble import StackingRegressor

grids_fe['MLP'] =   grid_search_mlp_fe

best_estimators = [(model_name, grid.best_estimator_) for model_name, grid in grids_fe.items()]

# Define the candidate meta-models
meta_models = {
    'MLP': MLPRegressor(random_state=42, max_iter=10000, n_iter_no_change=3, learning_rate_init=0.001),
    'LinearRegression': LinearRegression(),
    'XGBoost': XGBRegressor(random_state=42)
}

# Define the hyperparameter grids for each meta-model
meta_param_grids = {
    'MLP': {
        'final_estimator__hidden_layer_sizes': [(10,), (10, 10)],
        'final_estimator__activation': ['relu', 'tanh'],
        'final_estimator__solver': ['adam', 'sgd'],
        'final_estimator__alpha': [ 0.001, 0.01, .1, .5],
        'final_estimator__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'LinearRegression': {},
    'XGBoost': {
        'final_estimator__n_estimators': [100, 200, 500],
        'final_estimator__learning_rate': [0.01, 0.1, 0.3],
        'final_estimator__max_depth': [3, 6, 10],
    }
}

# 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=42)


best_model_file = "best_stacking_model.pkl"

try:
    # Try to load the model from file
    best_model = joblib.load(best_model_file)
    print("Loaded the best stacking ensemble model from file.")
except FileNotFoundError:
    # Define the candidate meta-models and other necessary variables
    # Your existing code for defining meta-models, hyperparameter grids, and cross-validation can remain unchanged
    
    # Train and tune the stacking ensemble
    best_score = float('inf')
    best_model = None

    for meta_name, meta_model in meta_models.items():
        print(f'Training and tuning {meta_name} as the meta-model...')
        stacking_regressor = StackingRegressor(estimators=best_estimators, final_estimator=meta_model, cv=cv)
        grid_search = GridSearchCV(estimator=stacking_regressor, param_grid=meta_param_grids[meta_name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
        grid_search.fit(X_train_fe, y_train_fe)
        best_params = grid_search.best_params_
        best_rmse = np.sqrt(-1 * grid_search.best_score_)
    
        print(f'Best parameters for {meta_name}: {best_params}')
        print(f'Best RMSE for {meta_name}: {best_rmse}\n')
    
        if best_rmse < best_score:
            best_score = best_rmse
            best_model = grid_search

    # Save the best stacking ensemble model to file
    joblib.dump(best_model, best_model_file)
    print(f"Saved the best stacking ensemble model to {best_model_file}")

# Evaluate the best stacking ensemble on the test data
y_pred = best_model.predict(X_test_fe)
rmse = np.sqrt(mean_squared_error(y_test_fe, y_pred))
print(f"Best stacking ensemble's RMSE on test data: {rmse}")

Training and tuning MLP as the meta-model...
Fitting 3 folds for each of 96 candidates, totalling 288 fits


KeyboardInterrupt: 