## 1. Data Collection

In [3]:
import pandas as pd

# Load the training and test datasets
train_df = pd.read_csv('/content/test.csv')
test_df = pd.read_csv('/content/train.csv')

# Check dataset shapes
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Train shape: (1459, 80)
Test shape: (1460, 81)


## 2. Exploratory Data Analysis (EDA)

In [4]:
# Basic info
train_df.info()
train_df.describe()

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of target variable
sns.histplot(train_df['SalePrice'], kde=True)
plt.title('Target Variable Distribution')
plt.show()

# Correlation heatmap (top correlated features)
corr = train_df.corr(numeric_only=True)
top_corr = corr['SalePrice'].sort_values(ascending=False)[1:11]
sns.heatmap(train_df[top_corr.index].corr(), annot=True, cmap='coolwarm')
plt.title('Top Correlated Features with SalePrice')
plt.show()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

KeyError: 'SalePrice'

## 3. Data Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Drop ID columns
train_df = train_df.drop(columns=['Id'])
test_ids = test_df['Id']
test_df = test_df.drop(columns=['Id'])

# Separate features and target
X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Preprocessing pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])


## 4. Model Training

In [None]:
# Final pipeline with model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Train the model
model.fit(X_train, y_train)


## 5. Model Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on validation set
y_pred = model.predict(X_val)

# Metrics
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"RÂ² Score: {r2:.2f}")


## 6. Save the Trained Model

In [None]:
import joblib
joblib.dump(model, 'house_price_model.pkl')


## 7. Make Predictions on Test Set

In [None]:
# Predict on test set
test_preds = model.predict(test_df)

# Prepare submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_preds
})

submission.to_csv('submission.csv', index=False)


## 8. Monitoring & Maintenance

- Log predictions and monitor for concept drift.
- Retrain periodically using fresh data.
- Consider model versioning and performance dashboards.
