# Database link :
https://www.kaggle.com/datasets/aravinii/house-price-prediction-treated-dataset

### Reading the csv files and a quick consultation

In [10]:
import pandas as pd
# Load the data
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')

In [11]:
df_train.head()

Unnamed: 0,date,price,bedrooms,grade,has_basement,living_in_m2,renovated,nice_view,perfect_condition,real_bathrooms,has_lavatory,single_floor,month,quartile_zone
0,2014-06-20,237000.0,3,7,False,120.7739,False,False,False,1,False,True,6,2
1,2015-04-20,1328000.0,5,9,True,310.29602,True,True,False,3,False,True,4,4
2,2014-08-13,790500.0,4,9,False,311.22505,False,False,False,2,True,False,8,4
3,2015-04-08,431000.0,4,9,False,282.42512,False,False,False,2,True,False,4,2
4,2015-02-11,199000.0,3,7,True,162.58025,False,False,False,2,True,True,2,2


In [8]:
df_train.isnull().sum()

date                 0
price                0
bedrooms             0
grade                0
has_basement         0
living_in_m2         0
renovated            0
nice_view            0
perfect_condition    0
real_bathrooms       0
has_lavatory         0
single_floor         0
month                0
quartile_zone        0
dtype: int64

In [9]:
df_test.isnull().sum()

date                 0
price                0
bedrooms             0
grade                0
has_basement         0
living_in_m2         0
renovated            0
nice_view            0
perfect_condition    0
real_bathrooms       0
has_lavatory         0
single_floor         0
month                0
quartile_zone        0
dtype: int64

### Preparing the data and training the models

In [6]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
# Separate features and target variable
X_train = df_train.drop('price', axis=1)
y_train = df_train['price']
X_test = df_test.drop('price', axis=1)
y_test = df_test['price']
# Preprocess the data
# Convert date to datetime and extract year and month
for df in [X_train, X_test]:
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df.drop('date', axis=1, inplace=True)
# One-hot encode categorical variables
categorical_cols = ['grade', 'month', 'quartile_zone']
X_train = pd.get_dummies(X_train, columns=categorical_cols)
X_test = pd.get_dummies(X_test, columns=categorical_cols)
# Ensure X_test has all columns present in X_train
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
# Align the columns of X_test with X_train
X_test = X_test[X_train.columns]
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBOOST': xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05, 
    max_depth=6, 
    subsample=0.8,
    colsample_bytree=0.8, 
    random_state=42
)
}

# Train and evaluate models
results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2, 'model': model}

# Display results
print("Model Performance:")
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  MSE: {metrics['MSE']:.2f}")
    print(f"  R2: {metrics['R2']:.2f}")
    print()

# Find the best model
best_model_name = max(results, key=lambda x: results[x]['R2'])
best_model = results[best_model_name]['model']
print(f"The best performing model is: {best_model_name}")



  model = cd_fast.enet_coordinate_descent(


Model Performance:
Linear Regression:
  MSE: 120864.25
  R2: 0.76

Ridge Regression:
  MSE: 120864.98
  R2: 0.76

Lasso Regression:
  MSE: 120864.84
  R2: 0.76

Decision Tree:
  MSE: 167894.34
  R2: 0.54

Random Forest:
  MSE: 123190.12
  R2: 0.75

XGBOOST:
  MSE: 119770.75
  R2: 0.76

The best performing model is: XGBOOST


### Saving the best model , the scaler and the feature names

In [7]:
import joblib
# Save the best model
joblib.dump(best_model, 'xgb_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(X_train.columns.tolist(), 'feature_names.joblib')

print("Best model, scaler, and feature names have been saved.")

Best model, scaler, and feature names have been saved.
