In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Read the listings table from disk
data = pd.read_csv("allcities.csv")

# 'price' is the regression target; all remaining columns are passed on as X
y = data['price']
X = data.drop(columns=['price'])

# Column groups handled by the preprocessor
numeric_features = ['bedroom', 'bathroom', 'area']
categorical_features = [
    'seller_type', 'layout_type', 'property_type',
    'locality', 'furnish_type', 'city',
]

# Numeric columns are standardized; categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps transform from raising on categories that
# were not present when the encoder was fitted (e.g. a value that appears
# only in a held-out fold).
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Candidate regressors, keyed by the label used when reporting results
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
}

# Shuffled 5-fold splitter with a fixed seed, shared by every model
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Score every candidate on the same folds and report fold-averaged metrics
for model_name, model in models.items():
    # Preprocessing and estimator are chained so that both are fitted
    # together by pipeline.fit on each fold's training rows.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # Per-fold scores; one entry is appended per split
    rmse_scores, mae_scores, r2_scores = [], [], []

    for train_index, test_index in kfold.split(X):
        # kfold yields positional indices, hence .iloc
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Fit on the training rows, then predict the held-out rows
        y_pred = pipeline.fit(X_train, y_train).predict(X_test)

        # RMSE is taken as the square root of this fold's mean squared error
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    # Unweighted mean of each metric across the folds
    avg_rmse, avg_mae, avg_r2 = (
        np.mean(scores) for scores in (rmse_scores, mae_scores, r2_scores)
    )

    # Report the averaged metrics for this model, one item per line
    print(
        f"{model_name} Cross-Validation Results:",
        f"Average RMSE: {avg_rmse}",
        f"Average MAE: {avg_mae}",
        f"Average R^2: {avg_r2}",
        "-" * 50,
        sep="\n",
    )


Linear Regression Cross-Validation Results:
Average RMSE: 43870.05441889945
Average MAE: 18836.462555444046
Average R^2: 0.7715325350897894
--------------------------------------------------
Random Forest Cross-Validation Results:
Average RMSE: 32220.50760084095
Average MAE: 7401.803746489661
Average R^2: 0.8769656297604005
--------------------------------------------------
XGBoost Cross-Validation Results:
Average RMSE: 32889.66796504127
Average MAE: 10776.715195539256
Average R^2: 0.8715321827990454
--------------------------------------------------
