In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_csv(r"E:\dataset\house\data.csv")

In [3]:
df = dataset.copy()
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [4]:
df = df[df['price'] != 0]
print(f"\nShape after removing zero prices: {df.shape}")



Shape after removing zero prices: (4551, 18)


In [5]:
# Convert date column
df['date'] = pd.to_datetime(df['date'])

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Handle missing values if any
df = df.dropna()  # Or use appropriate imputation strategy
print(f"Shape after handling missing values: {df.shape}")



Missing values:
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64
Shape after handling missing values: (4551, 18)


In [6]:
df['sale_year'] = df['date'].dt.year
df['sale_month'] = df['date'].dt.month
df['sale_quarter'] = df['date'].dt.quarter
df['sale_day_of_year'] = df['date'].dt.dayofyear
df['sale_weekday'] = df['date'].dt.weekday

# Season features
df['is_spring'] = ((df['sale_month'] >= 3) & (df['sale_month'] <= 5)).astype(int)
df['is_summer'] = ((df['sale_month'] >= 6) & (df['sale_month'] <= 8)).astype(int)
df['is_fall'] = ((df['sale_month'] >= 9) & (df['sale_month'] <= 11)).astype(int)
df['is_winter'] = ((df['sale_month'] == 12) | (df['sale_month'] <= 2)).astype(int)
df['is_peak_season'] = ((df['sale_month'] >= 4) & (df['sale_month'] <= 6)).astype(int)

# Room-based features
df['total_rooms'] = df['bedrooms'] + df['bathrooms']
df['bedroom_bathroom_ratio'] = df['bedrooms'] / (df['bathrooms'] + 1)  

In [7]:
# Check for outliers in price and handle them
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"\nPrice statistics before outlier removal:")
print(f"Mean: ${df['price'].mean():,.2f}")
print(f"Median: ${df['price'].median():,.2f}")
print(f"Outliers below {lower_bound:,.2f} or above {upper_bound:,.2f}")



Price statistics before outlier removal:
Mean: $557,905.90
Median: $465,000.00
Outliers below -170,589.29 or above 1,154,353.57


In [10]:
outlier_mask = (df['price'] >= lower_bound) & (df['price'] <= upper_bound)
print(f"Removing {(~outlier_mask).sum()} outliers")
df = df[outlier_mask]

print(f"Final dataset shape: {df.shape}")


Removing 240 outliers
Final dataset shape: (4311, 30)


In [13]:
X_temp = df.drop(['price'], axis=1)
y = df['price']

# Split into train and test
X_train_temp, X_test_temp, y_train, y_test = train_test_split(
    X_temp, y, test_size=0.2, random_state=42, stratify=None
)

# Create copies to work with
X_train = X_train_temp.copy()
X_test = X_test_temp.copy()

In [14]:
print("\nCreating target-based features...")

# 1. City average price (using train data only)
city_stats = X_train.assign(price=y_train).groupby('city')['price'].agg(['mean', 'count', 'std']).reset_index()
city_stats.columns = ['city', 'avg_city_price', 'city_property_count', 'city_price_std']
city_stats['city_price_std'] = city_stats['city_price_std'].fillna(0)

# Merge with train and test
X_train = X_train.merge(city_stats, on='city', how='left')
X_test = X_test.merge(city_stats, on='city', how='left')



Creating target-based features...


In [15]:
# Handle unseen cities in test
overall_train_avg = y_train.mean()
overall_train_std = y_train.std()
X_test['avg_city_price'] = X_test['avg_city_price'].fillna(overall_train_avg)
X_test['city_property_count'] = X_test['city_property_count'].fillna(0)
X_test['city_price_std'] = X_test['city_price_std'].fillna(overall_train_std)


In [16]:

# 2. Country average price (using train data only)
country_stats = X_train.assign(price=y_train).groupby('country')['price'].agg(['mean', 'count']).reset_index()
country_stats.columns = ['country', 'avg_country_price', 'country_property_count']

X_train = X_train.merge(country_stats, on='country', how='left')
X_test = X_test.merge(country_stats, on='country', how='left')

X_test['avg_country_price'] = X_test['avg_country_price'].fillna(overall_train_avg)
X_test['country_property_count'] = X_test['country_property_count'].fillna(0)


In [17]:
# 3. Street frequency encoding (better than target encoding for high cardinality)
street_counts = X_train['street'].value_counts()
X_train['street_frequency'] = X_train['street'].map(street_counts)
X_test['street_frequency'] = X_test['street'].map(street_counts).fillna(0)


In [19]:
 
luxury_threshold = y_train.quantile(0.75)  # Top 25% as luxury
X_train['is_luxury'] = (y_train > luxury_threshold).astype(int)
# For test set, we can't use actual price, so use city average as proxy
X_test['is_luxury'] = (X_test['avg_city_price'] > luxury_threshold).astype(int)

room_luxury_threshold = X_train['total_rooms'].quantile(0.75)
X_train['is_luxury_rooms'] = (X_train['total_rooms'] > room_luxury_threshold).astype(int)
X_test['is_luxury_rooms'] = (X_test['total_rooms'] > room_luxury_threshold).astype(int)

In [21]:
# CATEGORICAL ENCODING
print("Encoding categorical variables...")

# City encoding - use frequency encoding for high cardinality
city_freq = X_train['city'].value_counts()
X_train['city_encoded'] = X_train['city'].map(city_freq)
X_test['city_encoded'] = X_test['city'].map(city_freq).fillna(0)

# Country encoding - label encoding for lower cardinality
le_country = LabelEncoder()
X_train['country_encoded'] = le_country.fit_transform(X_train['country'])

# Handle unseen countries in test
country_mapping = dict(zip(le_country.classes_, le_country.transform(le_country.classes_)))
X_test['country_encoded'] = X_test['country'].map(country_mapping).fillna(-1)


Encoding categorical variables...


In [22]:
# Drop original categorical columns and other unnecessary columns
cols_to_drop = ['date', 'street', 'city', 'statezip', 'country']
X_train_final = X_train.drop(cols_to_drop, axis=1)
X_test_final = X_test.drop(cols_to_drop, axis=1)

print(f"\nFinal feature set shape: {X_train_final.shape}")
print("Features:", X_train_final.columns.tolist())

# Check for any remaining missing values
print(f"\nMissing values in train: {X_train_final.isnull().sum().sum()}")
print(f"Missing values in test: {X_test_final.isnull().sum().sum()}")

# Fill any remaining missing values
X_train_final = X_train_final.fillna(0)
X_test_final = X_test_final.fillna(0)


Final feature set shape: (3448, 34)
Features: ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'sale_year', 'sale_month', 'sale_quarter', 'sale_day_of_year', 'sale_weekday', 'is_spring', 'is_summer', 'is_fall', 'is_winter', 'is_peak_season', 'total_rooms', 'bedroom_bathroom_ratio', 'avg_city_price', 'city_property_count', 'city_price_std', 'avg_country_price', 'country_property_count', 'street_frequency', 'is_luxury', 'is_luxury_rooms', 'city_encoded', 'country_encoded']

Missing values in train: 849
Missing values in test: 0


In [23]:
# Feature scaling
print("\nApplying feature scaling...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)


Applying feature scaling...


In [24]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectPercentile, f_regression

# Create polynomial features for key numerical columns
key_numerical_cols = ['bedrooms', 'bathrooms', 'total_rooms', 'sale_year']
poly_data_train = X_train_final[key_numerical_cols]
poly_data_test = X_test_final[key_numerical_cols]

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly_features_train = poly.fit_transform(poly_data_train)
poly_features_test = poly.transform(poly_data_test)

# Add polynomial features to main dataset
poly_feature_names = poly.get_feature_names_out(key_numerical_cols)
poly_df_train = pd.DataFrame(poly_features_train, columns=poly_feature_names, index=X_train_final.index)
poly_df_test = pd.DataFrame(poly_features_test, columns=poly_feature_names, index=X_test_final.index)

# Combine with original features (remove duplicate base features)
base_cols_to_remove = [col for col in key_numerical_cols if col in poly_feature_names]
X_train_enhanced = pd.concat([X_train_final.drop(base_cols_to_remove, axis=1), poly_df_train], axis=1)
X_test_enhanced = pd.concat([X_test_final.drop(base_cols_to_remove, axis=1), poly_df_test], axis=1)


In [25]:
# Feature selection to remove less important features
print("Applying feature selection...")
selector = SelectPercentile(f_regression, percentile=75)
X_train_selected = selector.fit_transform(X_train_enhanced, y_train)
X_test_selected = selector.transform(X_test_enhanced)

print(f"Features after selection: {X_train_selected.shape[1]} (from {X_train_enhanced.shape[1]})")

# Scale the enhanced features
scaler_enhanced = StandardScaler()
X_train_scaled_enhanced = scaler_enhanced.fit_transform(X_train_selected)
X_test_scaled_enhanced = scaler_enhanced.transform(X_test_selected)

Applying feature selection...
Features after selection: 30 (from 40)


In [26]:
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score


# Model Training and Evaluation
print("\n" + "="*50)
print("COMPREHENSIVE MODEL TRAINING AND EVALUATION")
print("="*50)

# Dictionary to store all models and their results
models_results = {}

# 1. Random Forest (Optimized)
print("\n1. Random Forest Regressor:")
rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=3,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train_final, y_train)
rf_pred = rf_model.predict(X_test_final)
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
models_results['Random Forest'] = {'model': rf_model, 'r2': rf_r2, 'rmse': rf_rmse, 'mae': rf_mae, 'predictions': rf_pred}
print(f"R²: {rf_r2:.4f}, RMSE: ${rf_rmse:,.2f}, MAE: ${rf_mae:,.2f}")



COMPREHENSIVE MODEL TRAINING AND EVALUATION

1. Random Forest Regressor:
R²: 0.6965, RMSE: $119,269.10, MAE: $83,061.07


In [27]:
print("\n4. Gradient Boosting Regressor:")
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42
)
gb_model.fit(X_train_final, y_train)
gb_pred = gb_model.predict(X_test_final)
gb_r2 = r2_score(y_test, gb_pred)
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_pred))
gb_mae = mean_absolute_error(y_test, gb_pred)
models_results['Gradient Boosting'] = {'model': gb_model, 'r2': gb_r2, 'rmse': gb_rmse, 'mae': gb_mae, 'predictions': gb_pred}
print(f"R²: {gb_r2:.4f}, RMSE: ${gb_rmse:,.2f}, MAE: ${gb_mae:,.2f}")



4. Gradient Boosting Regressor:
R²: 0.6845, RMSE: $121,620.26, MAE: $84,886.28


In [28]:
print("\n5. Extra Trees Regressor:")
et_model = ExtraTreesRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=3,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)
et_model.fit(X_train_final, y_train)
et_pred = et_model.predict(X_test_final)
et_r2 = r2_score(y_test, et_pred)
et_rmse = np.sqrt(mean_squared_error(y_test, et_pred))
et_mae = mean_absolute_error(y_test, et_pred)
models_results['Extra Trees'] = {'model': et_model, 'r2': et_r2, 'rmse': et_rmse, 'mae': et_mae, 'predictions': et_pred}
print(f"R²: {et_r2:.4f}, RMSE: ${et_rmse:,.2f}, MAE: ${et_mae:,.2f}")



5. Extra Trees Regressor:
R²: 0.7066, RMSE: $117,267.50, MAE: $81,661.80


In [29]:
# 6. Ridge Regression
print("\n6. Ridge Regression:")
ridge_model = Ridge(alpha=100, random_state=42)
ridge_model.fit(X_train_scaled_enhanced, y_train)
ridge_pred = ridge_model.predict(X_test_scaled_enhanced)
ridge_r2 = r2_score(y_test, ridge_pred)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
ridge_mae = mean_absolute_error(y_test, ridge_pred)
models_results['Ridge'] = {'model': ridge_model, 'r2': ridge_r2, 'rmse': ridge_rmse, 'mae': ridge_mae, 'predictions': ridge_pred}
print(f"R²: {ridge_r2:.4f}, RMSE: ${ridge_rmse:,.2f}, MAE: ${ridge_mae:,.2f}")



6. Ridge Regression:
R²: 0.6564, RMSE: $126,905.07, MAE: $93,057.98


In [30]:
print("\n7. Lasso Regression:")
lasso_model = Lasso(alpha=1000, random_state=42, max_iter=2000)
lasso_model.fit(X_train_scaled_enhanced, y_train)
lasso_pred = lasso_model.predict(X_test_scaled_enhanced)
lasso_r2 = r2_score(y_test, lasso_pred)
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
lasso_mae = mean_absolute_error(y_test, lasso_pred)
models_results['Lasso'] = {'model': lasso_model, 'r2': lasso_r2, 'rmse': lasso_rmse, 'mae': lasso_mae, 'predictions': lasso_pred}
print(f"R²: {lasso_r2:.4f}, RMSE: ${lasso_rmse:,.2f}, MAE: ${lasso_mae:,.2f}")



7. Lasso Regression:
R²: 0.6619, RMSE: $125,889.58, MAE: $91,948.89


In [32]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/150.0 MB 2.0 MB/s eta 0:01:16
   ---------------------------------------- 1.3/150.0 MB 2.2 MB/s eta 0:01:09
    --------------------------------------- 2.1/150.0 MB 2.6 MB/s eta 0:00:58
    --------------------------------------- 2.9/150.0 MB 2.9 MB/s eta 0:00:51
    --------------------------------------- 3.7/150.0 MB 3.0 MB/s eta 0:00:49
   - -------------------------------------- 4.5/150.0 MB 3.1 MB/s eta 0:00:48
   - -------------------------------------- 5.2/150.0 MB 3.2 MB/s eta 0:00:45
   - -----------------


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
