In [1]:
import pandas as pd
from tqdm import tqdm  # Import tqdm for progress bar
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from load_data import DataLoader
from datetime import date
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.neural_network import MLPRegressor


# Initialize the MinMaxScaler
scaler = MinMaxScaler()


Columns where it makes sense to delete outliers:
LotFrontage if above 300
LotArea if above 200000
BsmtFinSF1 if above 3000
TotalBsmtSF if above 4000
LowQualFinSF if above 1000
GrLivArea if above 4000
GarageYrBlt if above 2050
WoodDeckSF if above 1000
OpenPorchSF if above 700
EnclosedPorch if above 800


pd.set_option('display.max_columns', None)  # None means show all columns


In [36]:
dataloader = DataLoader()
dataloader.load_data('data/train.csv')
dataloader.preprocess_data()
df_train = dataloader.data

df_train_target = df_train['SalePrice']
df_train = df_train.drop(columns='SalePrice')

dataloader.load_data('data/test.csv')
dataloader.preprocess_data()
df_test = dataloader.data


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data[col].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data['MasVnrArea'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

In [37]:
df_train.to_csv('df_train.csv')
df_test.to_csv('df_test.csv')

In [38]:
df_train_target.to_csv('df_train_target.csv')

In [16]:
missing_columns = [col for col in df_train.columns if col not in df_test]
df_test[missing_columns] = 0
df_test = df_test[df_train.columns]


In [17]:
import pandas as pd

def create_features_and_clean(df):
    # First, define initial features that don't depend on others in the dictionary
    initial_features = {
        'Age': df['YrSold'] - df['YearBuilt'],
        'RemodelAge': df['YrSold'] - df['YearRemodAdd'],
        'Total_Baths': df['BsmtFullBath'] + df['BsmtHalfBath'] * 0.5 + df['FullBath'] + df['HalfBath'],
        'Total_Finished_SF': df['GrLivArea'] + df['TotalBsmtSF'],
        'Total_Porch_Area': df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch'],
        'Garage_Age': df['YrSold'] - df['GarageYrBlt'],
        'Quality_Index': df['OverallQual'] + df['OverallCond'],
        'Total_Rooms': df['BedroomAbvGr'] + df['KitchenAbvGr'] + df['TotRmsAbvGrd']
    }
    
    # Add the initial features to the DataFrame
    df = pd.concat([df, pd.DataFrame(initial_features, index=df.index)], axis=1)

    # Now define additional features that depend on the initial features
    additional_features = {
        'Lot_Size_per_Room': df['LotArea'] / (df['Total_Rooms']).replace(0, 1),
        'Living_Area_per_Bath': df['GrLivArea'] / df['Total_Baths'].replace(0, 1),
        'Garage_Area_Ratio': df['GarageArea'] / df['Total_Finished_SF'].replace(0, 1),
        'Total_Porch_per_Room': df['Total_Porch_Area'] / df['Total_Rooms'].replace(0, 1),
        'Quality_Adjusted_Area': df['TotalBsmtSF'] * df['Quality_Index']
    }

    # Add the additional features to the DataFrame
    df = pd.concat([df, pd.DataFrame(additional_features, index=df.index)], axis=1)
    
    # Drop unnecessary columns
    columns_to_drop = ['YrSold', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
    df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

    return df

# Apply the function to both training and test datasets
df_train = create_features_and_clean(df_train)
df_test = create_features_and_clean(df_test)


In [None]:
def calculate_vif(df, threshold=5.0):
    # Add a constant (intercept) column for VIF calculation
    df_with_constant = add_constant(df)

    # Calculate VIF for each feature
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df_with_constant.values, i + 1) for i in range(len(df.columns))]
    
    # Display VIF
    print("Variance Inflation Factors:\n", vif_data)
    
    # Identify features with VIF above the threshold
    high_vif_features = vif_data[vif_data["VIF"] > threshold]["Feature"].tolist()
    
    if high_vif_features:
        print(f"\nFeatures with VIF above {threshold}: {high_vif_features}")
    else:
        print(f"\nNo features found with VIF above {threshold}")
    
    return high_vif_features

# Select only numerical features for VIF calculation
numerical_features = df_train.select_dtypes(include=[float, int]).dropna(axis=1)

# Calculate VIF and identify high VIF features in the training data
high_vif_features = calculate_vif(numerical_features, threshold=5.0)

# Drop high-VIF columns (optional based on result)
df_train_dropped = df_train.drop(columns=high_vif_features)
df_test_dropped = df_test.drop(columns=high_vif_features)


  vif = 1. / (1. - r_squared_i)


Variance Inflation Factors:
                   Feature         VIF
0             LotFrontage    1.835224
1                 LotArea   16.350299
2                   Alley    1.258578
3                LotShape    1.291783
4             LandContour    1.755106
..                    ...         ...
62      Lot_Size_per_Room  408.409405
63   Living_Area_per_Bath   14.846993
64      Garage_Area_Ratio    7.155979
65   Total_Porch_per_Room  402.677860
66  Quality_Adjusted_Area   60.558367

[67 rows x 2 columns]

Features with VIF above 5.0: ['LotArea', 'OverallQual', 'OverallCond', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'FireplaceQu', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Age', 'RemodelAge', 'Total_Baths', 'Tot

In [20]:
# Convert all boolean columns to integers (1 for True, 0 for False)
df_train = df_train_dropped.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)
df_test = df_test_dropped.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)


In [None]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(df_train, df_train_target, test_size=0.2, random_state=42)

# Prepare a list of models to evaluate, including MLPRegressor
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Linear Regression": LinearRegression(),
    "Support Vector Regressor": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "Neural Network": MLPRegressor(hidden_layer_sizes=(100, ), max_iter=10000, random_state=42)  # Adding the neural network
}

# Dictionary to store the results
results = {}

# Evaluate each model
for model_name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    # Predict on the test set
    predictions = model.predict(X_test)
    # Calculate mean squared error
    mse = mean_squared_error(y_test, predictions)
    results[model_name] = mse

# Print out the results
for model_name, mse in results.items():
    print(f"{model_name}: MSE = {mse:.2f}")


Neural Network: MSE = 227703317.43




In [34]:
1577091444.8393743 - 1370980223.5285773

206111221.31079698

In [None]:
sorted(results.values())

{'Random Forest': np.float64(1370980223.5285773),
 'Linear Regression': np.float64(5.041245555235461e+31),
 'Support Vector Regressor': np.float64(7857597159.34528),
 'K-Neighbors Regressor': np.float64(2550301243.0965753),
 'Gradient Boosting': np.float64(1375905119.823203),
 'AdaBoost': np.float64(2482791716.224041),
 'XGBoost': np.float64(1577091444.8393743),
 'Neural Network': np.float64(14521672889.864958)}

In [35]:
# 1. Fine-tuning Random Forest
rf_param_grid = {
    'max_depth': [10, 12, 14],
    'n_estimators': [100, 200],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_model = RandomForestRegressor(random_state=42)

# Use tqdm with GridSearchCV
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0)

# Fit GridSearchCV with tqdm progress bar
with tqdm(total=len(rf_param_grid['max_depth']) * len(rf_param_grid['n_estimators']) * len(rf_param_grid['min_samples_split']) * len(rf_param_grid['min_samples_leaf']),
            desc="Random Forest") as pbar:
    rf_grid_search.fit(df_train, df_train_target)
    pbar.update()  # Update after each fit (not necessary since GridSearchCV updates internally)

# Print best parameters and best score for Random Forest
print("Random Forest Best Parameters:", rf_grid_search.best_params_)
print("Random Forest Best CV Score (neg_mean_squared_error):", rf_grid_search.best_score_)

# Get the best Random Forest model
best_model1 = rf_grid_search.best_estimator_

# 2. Fine-tuning Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'min_samples_split': [2, 5],
}

gb_model = GradientBoostingRegressor(random_state=42)

# Use tqdm with GridSearchCV
gb_grid_search = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0)

# Fit GridSearchCV with tqdm progress bar
with tqdm(total=len(gb_param_grid['n_estimators']) * len(gb_param_grid['max_depth']) * len(gb_param_grid['learning_rate']) * len(gb_param_grid['min_samples_split']),
            desc="Gradient Boosting") as pbar:
    gb_grid_search.fit(df_train, df_train_target)
    pbar.update()

# Print best parameters and best score for Gradient Boosting
print("Gradient Boosting Best Parameters:", gb_grid_search.best_params_)
print("Gradient Boosting Best CV Score (neg_mean_squared_error):", gb_grid_search.best_score_)

# Get the best Gradient Boosting model
best_model2 = gb_grid_search.best_estimator_

# 3. Fine-tuning XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

xgb_model = XGBRegressor(random_state=42)

# Use tqdm with GridSearchCV
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0)

# Fit GridSearchCV with tqdm progress bar
with tqdm(total=len(xgb_param_grid['n_estimators']) * len(xgb_param_grid['max_depth']) * len(xgb_param_grid['learning_rate']) * len(xgb_param_grid['subsample']),
            desc="XGBoost") as pbar:
    xgb_grid_search.fit(df_train, df_train_target)
    pbar.update()

# Print best parameters and best score for XGBoost
print("XGBoost Best Parameters:", xgb_grid_search.best_params_)
print("XGBoost Best CV Score (neg_mean_squared_error):", xgb_grid_search.best_score_)

# Get the best XGBoost model
best_model3 = xgb_grid_search.best_estimator_


Random Forest:   0%|          | 0/24 [00:33<?, ?it/s]


KeyboardInterrupt: 

In [6]:
# Make predictions with each of the best models
y_pred1 = best_model1.predict(df_test)
y_pred2 = best_model2.predict(df_test)
y_pred3 = best_model3.predict(df_test)

# Average the predictions
y_pred_avg = (y_pred1 + y_pred2 + y_pred3) / 3

# Create the DataFrame for submission
df_guess = pd.DataFrame(y_pred_avg, columns=['SalePrice'])
df_guess['Id'] = range(1461, 2920)  # Adjust the range according to your test set ID
df_guess = df_guess[['Id', 'SalePrice']]

# Get today's date
today = date.today()

# Save the predictions to a CSV file
df_guess.to_csv(f'predictions/prediction_{today}.csv', index=False)

print("Predictions saved successfully!")
