In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, KFold


In [2]:
# load the data 
train = pd.read_csv('/workspaces/codespaces-jupyter/Housing_dataset_train.csv')
test = pd.read_csv('/workspaces/codespaces-jupyter/Housing_dataset_test.csv')
sample_sub = pd.read_csv('/workspaces/codespaces-jupyter/Sample_submission.csv')

In [3]:
train.fillna(train.mean(), inplace=True)
train.fillna(train.mode().iloc[0], inplace=True)

In [4]:
train2 = train.copy()
test2 = test.copy()

In [5]:
def perform_feature_engineering(df):
    # Handle Categorical Features: One-hot encoding for 'loc' and 'title'
    ohe = OneHotEncoder(drop='first', sparse=False)
    encoded_features = ohe.fit_transform(df[['loc', 'title']])
    encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(['loc', 'title']))

    # Concatenate the encoded features with the original DataFrame
    df = pd.concat([df.drop(['loc', 'title'], axis=1), encoded_df], axis=1)

    # Creating Interaction Features
    df['total_rooms'] = df['bedroom'] + df['bathroom']
    df['bedroom_to_bathroom_ratio'] = df['bedroom'] / df['bathroom']

    return df

In [6]:
    # Perform feature engineering using the function
train2 = perform_feature_engineering(train2)
test2 = perform_feature_engineering(test2)

In [7]:
# split the data into train and test
X = train2.drop(['price'], axis=1)
y = train2['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# import the model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Initialize the regression models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'Support Vector Regression': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Neural Network': MLPRegressor(max_iter=1000)  # Increase max_iter for larger datasets
}


# Train and evaluate each model
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results.append((name, rmse))

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'RMSE'])

# Sort by RMSE in ascending order
results_df = results_df.sort_values(by='RMSE', ascending=True)

# Print the best model and its RMSE
print("Best Model:")
print(results_df.head(1))

Best Model:
               Model           RMSE
2  Gradient Boosting  624866.595563


In [9]:
gbr2 = GradientBoostingRegressor()

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 150], # Number of boosting stages to be run
    'learning_rate': [0.05, 0.1, 0.2], # Shrinkage parameter to prevent overfitting
    'max_depth': [3, 4, 5], # Maximum depth of the individual trees
}

# Perform KFold cross-validation for hyperparameter tuning
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_params = None
best_rmse = float('inf')

for n_estimators in param_grid['n_estimators']:
    for learning_rate in param_grid['learning_rate']:
        for max_depth in param_grid['max_depth']:
            rmses = []
            for train_idx, val_idx in kf.split(X):
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                gbr2 = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
                gbr2.fit(X_train, y_train)
                y_pred = gbr2.predict(X_val)
                rmse = mean_squared_error(y_val, y_pred, squared=False)
                rmses.append(rmse)

            mean_rmse = np.mean(rmses)
            if mean_rmse < best_rmse:
                best_rmse = mean_rmse
                best_params = {
                    'n_estimators': n_estimators,
                    'learning_rate': learning_rate,
                    'max_depth': max_depth
                }

# Initialize the Gradient Boosting Regressor with the best hyperparameters
best_gbr2 = GradientBoostingRegressor(**best_params)

# Train the model on the training data
best_gbr2.fit(X, y)

# Predict on the test set
y_pred = best_gbr2.predict(X_test)

# Calculate and print RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error for Gradient Boosting Regressor = {rmse:.2f}")

Root Mean Squared Error for Gradient Boosting Regressor = 492538.12
