In [44]:
from datetime import date
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from tqdm import tqdm  # Import tqdm for progress bar
from xgboost import XGBRegressor

from load_data import DataLoader


# Initialize the MinMaxScaler
# NOTE(review): `scaler` is not referenced again in any cell visible in this notebook
# (every model below is fitted on the unscaled frames) — confirm whether it is still needed.
scaler = MinMaxScaler()


In [33]:
# Lift the column cap so wide frames are rendered in full (None = no limit).
pd.options.display.max_columns = None


In [28]:
def _load_preprocessed(loader, csv_path):
    """Load ``csv_path`` through ``loader``, run its preprocessing and return ``loader.data``."""
    loader.load_data(csv_path)
    loader.preprocess_data()
    return loader.data


# One DataLoader instance is reused for both files, train first and test second.
dataloader = DataLoader()

# Training data: split the target column off from the feature columns.
train_frame = _load_preprocessed(dataloader, 'data/train.csv')
df_train_target = train_frame['SalePrice']
df_train = train_frame.drop(columns='SalePrice')

# Test data goes through the same load + preprocess steps.
df_test = _load_preprocessed(dataloader, 'data/test.csv')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data[col].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data['MasVnrArea'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

In [29]:
# Columns present in the training frame but absent from the test frame (kept for inspection).
missing_columns = [col for col in df_train.columns if col not in df_test]

# Align the test frame to the training layout in a single step: columns missing from the
# test frame are created filled with 0, columns unknown to the training frame are dropped,
# and the column order follows df_train. reindex builds a new frame, so the loader's frame
# is not mutated and no columns are bulk-inserted into the existing frame.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)


In [37]:
def create_features_and_clean(df):
    """Add engineered features to ``df`` in place and drop the raw year columns.

    Derived columns, in insertion order: ``Age``, ``RemodelAge``, ``Total_Baths``,
    ``Total_Finished_SF``, ``Total_Porch_Area``, ``Garage_Age``, ``Quality_Index``,
    ``Total_Rooms``, ``Lot_Size_per_Room``, ``Living_Area_per_Bath``,
    ``Garage_Area_Ratio``, ``Total_Porch_per_Room`` and ``Quality_Adjusted_Area``.
    Afterwards ``YrSold``, ``YearBuilt``, ``YearRemodAdd`` and ``GarageYrBlt`` are removed.

    The frame is modified in place and nothing is returned. Calling the function again
    on a frame it has already processed is a no-op, so the notebook cell can be re-run
    without a ``KeyError`` on the dropped year columns.

    Args:
        df: DataFrame holding the raw columns read below (``YrSold``, ``YearBuilt``,
            ``GrLivArea``, ...).

    Raises:
        KeyError: if a required raw column is missing from an unprocessed frame.
    """
    year_columns = ['YrSold', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

    # The year columns are consumed and dropped below, so a second call on the same frame
    # would fail on df['YrSold']. Detect the already-processed state and leave it untouched.
    already_processed = 'Age' in df.columns and not any(col in df.columns for col in year_columns)
    if already_processed:
        return

    # Additive features.
    df['Age'] = df['YrSold'] - df['YearBuilt']
    df['RemodelAge'] = df['YrSold'] - df['YearRemodAdd']
    # Only the basement half baths are weighted by 0.5; HalfBath is added at full weight.
    df['Total_Baths'] = df['BsmtFullBath'] + df['BsmtHalfBath'] * 0.5 + df['FullBath'] + df['HalfBath']
    df['Total_Finished_SF'] = df['GrLivArea'] + df['TotalBsmtSF']
    df['Total_Porch_Area'] = (df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch'] +
                               df['3SsnPorch'] + df['ScreenPorch'])
    df['Garage_Age'] = df['YrSold'] - df['GarageYrBlt']
    df['Quality_Index'] = df['OverallQual'] + df['OverallCond']
    df['Total_Rooms'] = df['BedroomAbvGr'] + df['KitchenAbvGr'] + df['TotRmsAbvGrd']

    # Ratio features. A zero denominator is replaced by 1 so the ratio falls back to the
    # numerator instead of dividing by zero. The sums computed above are reused rather
    # than recomputed, so related columns cannot drift apart.
    rooms_divisor = df['Total_Rooms'].replace(0, 1)
    df['Lot_Size_per_Room'] = df['LotArea'] / rooms_divisor
    df['Living_Area_per_Bath'] = df['GrLivArea'] / df['Total_Baths'].replace(0, 1)
    df['Garage_Area_Ratio'] = df['GarageArea'] / df['Total_Finished_SF'].replace(0, 1)
    df['Total_Porch_per_Room'] = df['Total_Porch_Area'] / rooms_divisor
    # NOTE(review): this scales TotalBsmtSF only, not Total_Finished_SF — confirm that
    # basement-only area is what the feature name is meant to cover.
    df['Quality_Adjusted_Area'] = df['TotalBsmtSF'] * df['Quality_Index']

    # The raw year columns are now represented by the age features; drop them.
    df.drop(columns=year_columns, inplace=True, errors='ignore')

# Run the in-place feature engineering on the training frame first, then the test frame.
for frame in (df_train, df_test):
    create_features_and_clean(frame)


  df['Quality_Index'] = df['OverallQual'] + df['OverallCond']
  df['Total_Rooms'] = df['BedroomAbvGr'] + df['KitchenAbvGr'] + df['TotRmsAbvGrd']
  df['Lot_Size_per_Room'] = df['LotArea'] / (df['BedroomAbvGr'] + df['KitchenAbvGr'] + df['TotRmsAbvGrd']).replace(0, 1)
  df['Living_Area_per_Bath'] = df['GrLivArea'] / df['Total_Baths'].replace(0, 1)
  df['Garage_Area_Ratio'] = df['GarageArea'] / df['Total_Finished_SF'].replace(0, 1)
  df['Total_Porch_per_Room'] = (df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch'] +
  df['Quality_Adjusted_Area'] = df['TotalBsmtSF'] * (df['OverallQual'] + df['OverallCond'])


In [41]:
# Hold out 20% of the labelled rows for a quick side-by-side comparison.
X_train, X_test, y_train, y_test = train_test_split(
    df_train, df_train_target, test_size=0.2, random_state=42
)

# Candidate regressors, keyed by the label used in the report below.
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Linear Regression": LinearRegression(),
    "Support Vector Regressor": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "Neural Network": MLPRegressor(hidden_layer_sizes=(100, ), max_iter=1000, random_state=42),
}


def _holdout_mse(estimator):
    """Fit ``estimator`` on the training split and return its MSE on the held-out split."""
    estimator.fit(X_train, y_train)
    return mean_squared_error(y_test, estimator.predict(X_test))


# Label -> held-out mean squared error, evaluated in the order the models are declared.
results = {label: _holdout_mse(estimator) for label, estimator in models.items()}

# Report one line per model.
for label, score in results.items():
    print(f"{label}: MSE = {score:.2f}")


Random Forest: MSE = 822188137.23
Linear Regression: MSE = 38024979125706087965004922880.00
Support Vector Regressor: MSE = 7857868970.10
K-Neighbors Regressor: MSE = 2184084644.40
Gradient Boosting: MSE = 750083279.42
AdaBoost: MSE = 1375248103.07
XGBoost: MSE = 852645450.30
Neural Network: MSE = 6807854310.67




In [None]:
def tune_with_grid_search(label, estimator, param_grid, X, y, cv=5):
    """Grid-search ``estimator`` over ``param_grid`` and print the winning configuration.

    Args:
        label: Human-readable model name used as the prefix of the printed report.
        estimator: Unfitted regressor to tune.
        param_grid: Mapping of parameter name to the list of values to try.
        X: Feature frame the search is fitted on.
        y: Target values matching ``X``.
        cv: Number of cross-validation folds.

    Returns:
        The fitted ``GridSearchCV`` object (``best_estimator_``, ``best_params_`` and
        ``best_score_`` are available on it).
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        # Progress comes from scikit-learn's own verbose reporting. A tqdm bar wrapped
        # around fit() cannot advance while the search runs: GridSearchCV never touches
        # it, so a single update() after fit() leaves it at 0/N until the very end.
        verbose=1,
    )
    search.fit(X, y)

    print(f"{label} Best Parameters:", search.best_params_)
    print(f"{label} Best CV Score (neg_mean_squared_error):", search.best_score_)
    return search


# 1. Fine-tuning Random Forest
rf_param_grid = {
    'max_depth': [10, 12, 14],
    'n_estimators': [100, 200],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_model = RandomForestRegressor(random_state=42)
rf_grid_search = tune_with_grid_search("Random Forest", rf_model, rf_param_grid, df_train, df_train_target)

# Get the best Random Forest model
best_model1 = rf_grid_search.best_estimator_

# 2. Fine-tuning Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'min_samples_split': [2, 5],
}
gb_model = GradientBoostingRegressor(random_state=42)
gb_grid_search = tune_with_grid_search("Gradient Boosting", gb_model, gb_param_grid, df_train, df_train_target)

# Get the best Gradient Boosting model
best_model2 = gb_grid_search.best_estimator_

# 3. Fine-tuning XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}
xgb_model = XGBRegressor(random_state=42)
xgb_grid_search = tune_with_grid_search("XGBoost", xgb_model, xgb_param_grid, df_train, df_train_target)

# Get the best XGBoost model
best_model3 = xgb_grid_search.best_estimator_


Random Forest:   0%|          | 0/24 [00:00<?, ?it/s]

In [6]:
# Make predictions with each of the best models
y_pred1 = best_model1.predict(df_test)
y_pred2 = best_model2.predict(df_test)
y_pred3 = best_model3.predict(df_test)

# Average the predictions (equal weight for the three tuned models)
y_pred_avg = (y_pred1 + y_pred2 + y_pred3) / 3

# Create the DataFrame for submission
df_guess = pd.DataFrame(y_pred_avg, columns=['SalePrice'])
# The ids are hard-coded as 1461..2919 (1459 values); pandas raises ValueError here if
# the number of predictions differs, which doubles as a row-count check.
# NOTE(review): presumably these match the Id column of data/test.csv — adjust the
# range according to your test set ID.
df_guess['Id'] = range(1461, 2920)
df_guess = df_guess[['Id', 'SalePrice']]

# Get today's date (used to stamp the output file name)
today = date.today()

# Save the predictions to a CSV file. to_csv does not create missing folders, so make
# sure the output directory exists first.
output_path = Path(f'predictions/prediction_{today}.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_guess.to_csv(output_path, index=False)

print("Predictions saved successfully!")
