In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


**Howdy!
This notebook is a simple reference file. Do upvote and support.**

Import required libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

Load the data

In [3]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

Combine the train and test data for preprocessing

In [4]:
combined = pd.concat([train, test], sort=False)

Feature Engineering

Add new features based on existing ones

In [5]:
combined['TotalSF'] = combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF']
combined['TotalBathrooms'] = combined['FullBath'] + (0.5 * combined['HalfBath']) + combined['BsmtFullBath'] + (0.5 * combined['BsmtHalfBath'])
combined['TotalPorchSF'] = combined['OpenPorchSF'] + combined['EnclosedPorch'] + combined['3SsnPorch'] + combined['ScreenPorch']

Fill missing values

In [6]:
combined['MSZoning'] = combined['MSZoning'].fillna(combined['MSZoning'].mode()[0])
combined['Utilities'] = combined['Utilities'].fillna(combined['Utilities'].mode()[0])
combined['Exterior1st'] = combined['Exterior1st'].fillna(combined['Exterior1st'].mode()[0])
combined['Exterior2nd'] = combined['Exterior2nd'].fillna(combined['Exterior2nd'].mode()[0])
combined['KitchenQual'] = combined['KitchenQual'].fillna(combined['KitchenQual'].mode()[0])
combined['Functional'] = combined['Functional'].fillna(combined['Functional'].mode()[0])
combined['SaleType'] = combined['SaleType'].fillna(combined['SaleType'].mode()[0])

Encode categorical features

In [7]:
le = LabelEncoder()
categorical_features = combined.select_dtypes(include=['object']).columns
for col in categorical_features:
    combined[col] = le.fit_transform(combined[col])

Split the data back into train and test sets

In [8]:
train_processed = combined.iloc[:len(train), :]
test_processed = combined.iloc[len(train):, :]

Define X and y

In [9]:
X = train_processed.drop(['Id', 'SalePrice'], axis=1)
y = train_processed['SalePrice']

Split the train data into train and validation sets

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Fill missing values with mean

In [11]:
X_train.fillna(X_train.mean(), inplace=True)  

Random Forests (RFs)

In [12]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

Fill missing values with mean

In [13]:
X_val.fillna(X_val.mean(), inplace=True) 

Predict on the validation set

In [14]:
rf_preds = rf_model.predict(X_val)

Calculate the Root Mean Squared Error (RMSE) on the validation set

In [15]:
rf_rmse = np.sqrt(mean_squared_error(y_val, rf_preds))
print("RF RMSE:", rf_rmse)

RF RMSE: 29181.164555052997


Gradient Boosting (GB)

In [16]:
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

Predict on the validation set

In [17]:
gb_preds = gb_model.predict(X_val)

Calculate the RMSE on the validation set

In [18]:
gb_rmse = np.sqrt(mean_squared_error(y_val, gb_preds))
print("GB RMSE:", gb_rmse)

GB RMSE: 27745.701484355628


Combine the train and test data for predictions

In [19]:
combined_preds = pd.concat([X_train, X_val, test_processed.drop(['Id', 'SalePrice'], axis=1)], sort=False)

Fill missing values with mean

In [20]:
combined_preds.fillna(combined_preds.mean(), inplace=True) 

Random Forests (RFs) predictions on the combined data

In [21]:
rf_model.fit(combined_preds.iloc[:len(y), :], y)
rf_preds_test = rf_model.predict(combined_preds.iloc[len(y):len(y)+len(test_processed), :])

Gradient Boosting (GB) predictions on the combined data

In [22]:
gb_model.fit(combined_preds.iloc[:len(y), :], y)
gb_preds_test = gb_model.predict(combined_preds.iloc[len(y):len(y)+len(test_processed), :])

Ensemble the predictions with a simple average

In [23]:
ensemble_preds = (rf_preds_test + gb_preds_test) / 2

Save predictions to a CSV file

In [24]:
submission = pd.DataFrame({'Id': test_processed['Id'], 'SalePrice': ensemble_preds})
submission.to_csv('submission3.csv', index=False)