In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib  # For saving the model

# --- Load data ---------------------------------------------------------------
# Raw competition files. Within this cell only original_test['Id'] is used
# (to label the submission rows); original_train is loaded but not used here.
original_train = pd.read_csv("house_prices_data/train.csv")
original_test = pd.read_csv("house_prices_data/test.csv")

# Feature tables used for modelling (presumably the preprocessed versions of
# the raw files above -- confirm against the step that writes them).
upd_train = pd.read_csv("upd_train.csv")
upd_test = pd.read_csv("upd_test.csv")

# --- Train / validation split ------------------------------------------------
# Keep the un-split features and target under their own names so that
# X_train / y_train are not rebuilt from themselves.
X_full = upd_train.drop(columns=['SalePrice'])
y_full = upd_train['SalePrice']

# 80/20 split with a fixed seed so the validation metrics are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)

# --- Fit and persist the model -----------------------------------------------
# NOTE(review): the model is fitted on the 80% training split only, and that
# same model is what gets saved and used for the submission below.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
joblib.dump(model, 'house_price_model.pkl')  # Model saved as house_price_model.pkl

# --- Predict on the test features --------------------------------------------
# Fail early, with a readable message, if the test table lacks any column the
# model was trained on.
missing_features = X_train.columns.difference(upd_test.columns)
if len(missing_features) > 0:
    raise ValueError(
        "upd_test.csv is missing feature columns used for training: "
        f"{list(missing_features)}"
    )

# Select the test features by the training column names so the model receives
# them in the order it was fitted on, whatever the column order (or any extra
# columns) in upd_test.csv. No column is dropped from the training feature set.
X_test = upd_test[X_train.columns]
predictions = model.predict(X_test)

# --- Build and save the submission -------------------------------------------
# Ids come from the raw test file and are paired with predictions by position,
# so both must describe the same number of rows.
if len(predictions) != len(original_test):
    raise ValueError(
        f"Row count mismatch: {len(predictions)} predictions for "
        f"{len(original_test)} test Ids"
    )

submission = pd.DataFrame({
    'Id': original_test['Id'],  # Use the Id from the original test data
    'SalePrice': predictions  # The predicted SalePrice values
})

# Write without the DataFrame index so the file holds only Id and SalePrice.
submission.to_csv("submission.csv", index=False)

print("Submission file has been saved as 'submission.csv'.")
print("Model has been saved as 'house_price_model.pkl'.")


Submission file has been saved as 'submission.csv'.
Model has been saved as 'house_price_model.pkl'.


In [7]:
# Import necessary libraries for evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the fitted model on the validation split held out during training.
y_pred = model.predict(X_val)

# Goodness of fit, then the error metrics; RMSE is derived from MSE.
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred)

# Report all four metrics, one per line.
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)


Mean Absolute Error (MAE): 17453.885273972603
Mean Squared Error (MSE): 827498078.188561
Root Mean Squared Error (RMSE): 28766.266323396245
R-squared (R²): 0.8921169640199667


In [8]:
# Display the submission table (Id and predicted SalePrice) as the cell output.
submission

Unnamed: 0,Id,SalePrice
0,1461,129247.50
1,1462,151231.50
2,1463,178607.38
3,1464,187916.00
4,1465,204525.12
...,...,...
1454,2915,88876.50
1455,2916,90536.00
1456,2917,156106.97
1457,2918,118891.00
