In [3]:
pip install --upgrade scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [7]:
# ========== Imports ==========
import numpy as np
import pandas as pd

# For splitting dataset and searching for best hyperparameters
from sklearn.model_selection import train_test_split, GridSearchCV

# For dealing with missing values
from sklearn.impute import SimpleImputer

# For encoding categorical features
from sklearn.preprocessing import OneHotEncoder

# For applying different transformations to numeric vs. categorical columns
from sklearn.compose import ColumnTransformer

# For chaining transformations + model into a single workflow
from sklearn.pipeline import Pipeline

# Our regression model: Random Forest
from sklearn.ensemble import RandomForestRegressor

# Metric for regression error
from sklearn.metrics import mean_squared_error


# 1. Load data
train = pd.read_csv("train.csv")  # Training data
test = pd.read_csv("test.csv")    # Test data (no target column)

# Identify the target variable
TARGET = "SalePrice"

# Separate features (X) and target (y) from the training dataset
X = train.drop([TARGET], axis=1)  # All columns except SalePrice
y = train[TARGET]                 # The SalePrice column alone

# 2. Check for missing values (just for reference)
missing_counts = X.isna().sum().sort_values(ascending=False)
print("Missing value counts (top 10):\n", missing_counts.head(10))

# 3. Handle specific missing values (example):
# Fill missing numeric 'LotFrontage' with median
X["LotFrontage"] = X["LotFrontage"].fillna(X["LotFrontage"].median())
test["LotFrontage"] = test["LotFrontage"].fillna(X["LotFrontage"].median())

# Fill missing categorical with "None"
cat_cols_tmp = X.select_dtypes(include=['object']).columns
for col in cat_cols_tmp:
    X[col] = X[col].fillna("None")
    if col in test.columns:
        test[col] = test[col].fillna("None")

# 4. Split data into train/validation for local evaluation
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

# 5. Identify numeric/categorical columns for preprocessing
num_cols = X_train.select_dtypes(include=np.number).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

# Create numeric transformer (median imputer here)
numeric_transformer = SimpleImputer(strategy='median')

# Create categorical transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# 6. Combine these into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
)

# 7. Define the RandomForestRegressor model
#    Here, we set a few basic hyperparameters. You can tune further if needed.
model = RandomForestRegressor(
    n_estimators=100,  # number of trees
    random_state=42    # for reproducibility
)

# 8. Build the Pipeline that first transforms (imputes/encodes) and then trains
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# 9. Fit (train) the pipeline on the training set
pipeline.fit(X_train, y_train)

# 10. Evaluate on the validation set
# Make predictions on the validation data
y_pred_val = pipeline.predict(X_val)

# Calculate validation RMSE
val_mse = mean_squared_error(y_val, y_pred_val)
val_rmse = np.sqrt(val_mse)
print("Validation RMSE:", val_rmse)

# 11. [Optional] Hyperparameter Tuning Example (Commented Out)
"""
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    scoring='neg_root_mean_squared_error', 
    cv=5,        # 5-fold cross-validation
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate best model on validation set
val_preds = best_model.predict(X_val)
val_rmse_tuned = mean_squared_error(y_val, val_preds, squared=False)
print("Tuned Validation RMSE:", val_rmse_tuned)
"""

# 12. Retrain on the full dataset for final predictions
# (Comment out the above if using the tuned model from grid search)
pipeline.fit(X, y)

# 13. Predict on the test set
test_preds = pipeline.predict(test)

# 14. Save predictions to a CSV if needed
output = pd.DataFrame({'Id': test['Id'], 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)
print("Final predictions saved to submissionRandomforest.csv")


Missing value counts (top 10):
 PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageQual        81
GarageFinish      81
dtype: int64
Validation RMSE: 28922.11135465959
Final predictions saved to submissionRandomforest.csv
