In [1]:
from datetime import date
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from load_data import DataLoader

# Shared min-max scaler; (0, 1) is MinMaxScaler's default output range, spelled out here.
# NOTE(review): no cell visible in this notebook fits or applies it — confirm it is still needed.
scaler = MinMaxScaler(feature_range=(0, 1))


In [2]:
# One DataLoader instance is used for both CSV files, first train and then test.
dataloader = DataLoader()

# Training set: load, preprocess, then separate the SalePrice target from the features.
dataloader.load_data('data/train.csv')
dataloader.preprocess_data()
df_train = dataloader.data
df_train_target = df_train.loc[:, 'SalePrice']
df_train = df_train.drop(['SalePrice'], axis=1)

# Test set: same load/preprocess steps; no SalePrice column is split off here.
# NOTE(review): presumably load_data() rebinds dataloader.data rather than
# mutating the previously returned frame — confirm in load_data.py.
dataloader.load_data('data/test.csv')
dataloader.preprocess_data()
df_test = dataloader.data


In [3]:
# Feature columns the model is trained on that the preprocessed test frame lacks.
# NOTE(review): presumably these come from category levels seen only in the
# training data — confirm against DataLoader.preprocess_data.
missing_columns = [col for col in df_train.columns if col not in df_test]

# Align the test features with the training features in a single step:
#   * columns present only in training are added and filled with 0,
#   * columns present only in test are dropped,
#   * column order becomes identical to df_train.
# reindex returns a new frame, so the object still referenced by
# dataloader.data is not modified in place.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)


In [5]:
# Hyperparameter grid for the random forest. Every list currently holds a single
# value, so the search evaluates exactly one candidate with 5-fold CV; add more
# values to a list to widen the search.
param_grid = {
    'max_depth': [12],
    'n_estimators': [100],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

# Only the seed is fixed on the base estimator. GridSearchCV clones it and
# applies every key of param_grid to each candidate, so max_depth /
# n_estimators passed to this constructor would be silently overridden.
model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search.fit(df_train, df_train_target)

# best_score_ is the mean cross-validated score of the best candidate; with this
# scorer it is the negated MSE, so values closer to zero are better.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score (neg_mean_squared_error):", grid_search.best_score_)

# With the default refit=True, best_estimator_ has already been refit on the
# whole training set using the best parameters.
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV Score (neg_mean_squared_error): -915084761.0303571


In [6]:
# best_model comes from GridSearchCV.best_estimator_, which (refit=True being the
# default) is already fitted on the full training set. It is used directly
# instead of being trained a second time on the same data with the same seed.
y_pred = best_model.predict(df_test)

# Ids written to the submission file.
# NOTE(review): presumably the Id values of data/test.csv, in file order —
# confirm that preprocessing neither drops nor reorders test rows.
submission_ids = range(1461, 2920)
if len(y_pred) != len(submission_ids):
    # Fail with an explicit message rather than pandas' generic length error.
    raise ValueError(
        f"Expected {len(submission_ids)} predictions to match the submission Ids, "
        f"got {len(y_pred)}"
    )

df_guess = pd.DataFrame({'Id': submission_ids, 'SalePrice': y_pred})

# One output file per calendar day; a re-run on the same day overwrites it.
today = date.today()
output_path = Path(f'predictions/prediction_{today}.csv')
# Create the target directory if needed so the write cannot fail after training.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_guess.to_csv(output_path, index=False)
