In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

print("DONE")

DONE


In [4]:
train_id = train['Id']
test_id = test['Id']
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

In [6]:
y = train['SalePrice']
X = train.drop('SalePrice', axis=1)

In [7]:
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())
test[numeric_cols] = test[numeric_cols].fillna(test[numeric_cols].median())

categorical_cols = X.select_dtypes(include=['object']).columns
X[categorical_cols] = X[categorical_cols].fillna("None")
test[categorical_cols] = test[categorical_cols].fillna("None")

In [8]:
# Combine Train and Test for consistent processing
all_data = pd.concat([X, test])

# --- ENGINEERING NEW FEATURES ---

# 1. Total Square Footage
# Why: A big house is a big house. The model might struggle to add up basement + 1st floor + 2nd floor itself.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# 2. House Age
# Why: 'YearBuilt' is just a number. 'Age' (2025 - YearBuilt) represents "wear and tear".
all_data['HouseAge'] = 2025 - all_data['YearBuilt']

# 3. Total Bathrooms
# Why: Combining full baths and half baths gives a better sense of luxury.
all_data['TotalBath'] = all_data['FullBath'] + (0.5 * all_data['HalfBath']) + all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath'])

# 4. Has Pool / Has Garage
# Why: Convert complex columns into simple Yes/No flags. Sometimes purely having a pool matters more than its size.
all_data['HasPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['HasGarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)

# --- ENCODING ---

# Convert text ('Ex', 'Gd', 'TA') into numbers using One-Hot Encoding
all_data = pd.get_dummies(all_data)

# Split back into X and test
X = all_data.iloc[:len(train), :]
test = all_data.iloc[len(train):, :]

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [12]:
# Predict on validation set
predictions = rf.predict(X_val)

# Calculate RMSE
mse = mean_squared_error(y_val, predictions)
rmse = np.sqrt(mse)

# Calculate Kaggle-Style Log-RMSE (Optional but recommended)
log_rmse = np.sqrt(mean_squared_error(np.log(y_val), np.log(predictions)))

print(f"Standard RMSE: ${rmse:.2f}")
print(f"Kaggle Log-RMSE: {log_rmse:.4f}") # Aim for < 0.14 to be competitive

Standard RMSE: $29920.07
Kaggle Log-RMSE: 0.1522


In [13]:
# Retrain on ALL training data (not just the 80% split)
rf.fit(X, y)

# Predict on the real Test set
final_predictions = rf.predict(test)

# Create Submission DataFrame
submission = pd.DataFrame({
    "Id": test_id,
    "SalePrice": final_predictions
})

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")



Submission file created successfully!
