In [None]:
import pandas as pd
import numpy as np
from math import sqrt
# Notebook-wide display setting: passing None removes the row cap pandas
# applies when rendering a DataFrame/Series, so the null-count tables below
# are shown in full.
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled (train) and unlabelled (test) tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [None]:

# Stack train on top of test so cleaning and encoding are applied to both at once;
# ignore_index gives the stacked frame a fresh 0..n-1 row index.
df = [train, test]
df_combined = pd.concat(df, ignore_index=True)

In [None]:
# Missing-value count per column in the raw training table.
train.isna().sum()

In [None]:
# Columns removed before modelling: the row identifier plus a set of
# categorical features the author chose to discard.
columns_to_drop = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu', 'MasVnrType']

# Reassign instead of dropping in place, and tolerate already-missing labels,
# so re-running this cell does not raise KeyError on the second execution.
df_combined = df_combined.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Missing-value count per remaining column of the combined table.
df_combined.isna().sum()

In [None]:
# Group the raw training columns by storage dtype (object / int64 / float64).
object_columns, integer_columns, float_columns = (
    train.select_dtypes(include=[dtype_name]).columns.tolist()
    for dtype_name in ('object', 'int64', 'float64')
)

print("Objects : ", object_columns)
print("integers :", integer_columns)
print("float :", float_columns)



In [None]:
# Categorical columns whose gaps are filled with the most frequent value.
object_features_to_impute = [
    'Utilities', 'MSZoning',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Exterior1st', 'Electrical', 'Exterior2nd',
    'KitchenQual', 'Functional',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'SaleType',
]

# Numeric columns whose gaps are filled with the column mean.
float_features_to_impute = [
    'LotFrontage', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
]

In [None]:
# Fill missing values column by column.
#
# The filled Series is assigned back to the frame instead of calling
# `df_combined[feature].fillna(..., inplace=True)`: that form mutates a
# temporary selected from the frame (chained assignment), which emits a
# FutureWarning on recent pandas 2.x and does not update `df_combined` at all
# once Copy-on-Write is active.
#
# NOTE(review): the statistics are computed over train and test rows together,
# because both are stacked in df_combined - confirm this is intended.

# Numeric features: replace gaps with the column mean.
for feature in float_features_to_impute:
    df_combined[feature] = df_combined[feature].fillna(df_combined[feature].mean())

# Categorical features: replace gaps with the most frequent value
# (the first one if several values tie).
for feature in object_features_to_impute:
    mode_val = df_combined[feature].mode()[0]
    df_combined[feature] = df_combined[feature].fillna(mode_val)


In [None]:
# Re-check the per-column missing-value counts after imputation.
df_combined.isna().sum()

In [None]:
# Print the per-column dtype / non-null summary of the combined frame.
df_combined.info()

In [None]:
# Recompute the dtype groupings, this time on the cleaned combined frame;
# object_columns is what the one-hot encoding step consumes.
object_columns, integer_columns, float_columns = (
    df_combined.select_dtypes(include=[dtype_name]).columns.tolist()
    for dtype_name in ('object', 'int64', 'float64')
)



In [None]:

# One-hot encode every object-dtype column of the combined frame.
#
# The encoder is left at its default (sparse) output and densified with
# `.toarray()`. The original `OneHotEncoder(sparse=False)` fails on current
# scikit-learn: the `sparse` keyword was renamed to `sparse_output` and then
# removed, while `sparse_output` is unknown to older releases. The default
# plus `.toarray()` behaves the same on all of them.
OH_encoder = OneHotEncoder()
OH_encoder.fit(df_combined[object_columns])

# Generated column names of the form "<original column>_<category>".
OH_feature_names = OH_encoder.get_feature_names_out(object_columns)

# Keep df_combined's row index so the encoded block lines up on concat.
OH_cols = pd.DataFrame(
    OH_encoder.transform(df_combined[object_columns]).toarray(),
    columns=OH_feature_names,
    index=df_combined.index,
)

# Swap the raw categorical columns for their one-hot expansion.
df_final = df_combined.drop(object_columns, axis=1)

df_final = pd.concat([df_final, OH_cols], axis=1)


In [None]:
# Preview the first rows of the fully numeric, one-hot encoded frame.
df_final.head()

In [None]:
print('df_final shape:', df_final.shape)
print('df_train shape:', train.shape)
print('df_test shape:',  test.shape)

# df_final keeps the row order of the stacked [train, test] frame, so the
# first len(train) rows are the training rows. Deriving the boundary from
# `train` avoids a hard-coded row count that silently breaks when the input
# file changes.
n_train_rows = len(train)

# Explicit copies so later edits to X_Train / X_Test never touch df_final.
X_Train = df_final.iloc[:n_train_rows].copy()
X_Test  = df_final.iloc[n_train_rows:].copy()
Y_Train = train['SalePrice']

print('\nCheck that the datasets are consistent:\n')
print('X_train shape', X_Train.shape)
print('Y_train shape:', Y_Train.shape)
print('X_test shape:',  X_Test.shape)

# Fail loudly if the split does not line up with the source tables.
assert len(X_Train) == len(Y_Train), "feature/target row counts differ"
assert len(X_Test) == len(test), "test feature rows do not match test.csv"


In [None]:
# Remove the target from both feature matrices (for the test rows it only
# holds the NaN placeholders introduced by stacking train and test).
# Reassigning with errors='ignore' keeps this cell re-runnable: the original
# in-place drop raised KeyError on a second execution.
X_Train = X_Train.drop(columns=['SalePrice'], errors='ignore')
X_Test = X_Test.drop(columns=['SalePrice'], errors='ignore')

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_Train,
    Y_Train,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:

# Candidate regressors, keyed by the display name that also indexes param_grids.
models = dict([
    ('XGBoost', XGBRegressor()),
    ('LightGBM', LGBMRegressor()),
    ('CatBoost', CatBoostRegressor(silent=True)),
])


# Hyper-parameter grid per model. All three search the same values; only the
# keyword names for "number of trees" and "tree depth" differ per library.
param_grids = {
    model_key: {
        n_trees_param: [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        depth_param: [3, 4, 5],
    }
    for model_key, n_trees_param, depth_param in (
        ('XGBoost', 'n_estimators', 'max_depth'),
        ('LightGBM', 'n_estimators', 'max_depth'),
        ('CatBoost', 'iterations', 'depth'),
    )
}


# Tune every candidate with 5-fold grid search on the training split, score
# the tuned estimator on the validation split, and keep both the score and the
# estimator under the model's name.
results = {}          # model name -> validation RMSE
best_estimators = {}  # model name -> tuned estimator from its grid search
for model_name, model in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, param_grids[model_name], scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, Y_train)
    best_estimators[model_name] = grid_search.best_estimator_

    y_pred = best_estimators[model_name].predict(X_valid)
    rmse = np.sqrt(mean_squared_error(Y_valid, y_pred))
    results[model_name] = rmse
    print(f"{model_name}: RMSE = {rmse}")


# Lowest validation RMSE wins. The estimator is looked up by that name:
# reading `grid_search.best_estimator_` here would always return the model
# searched last in the loop, regardless of which one scored best.
best_model_name = min(results, key=results.get)
best_model = best_estimators[best_model_name]
print(f"Best model: {best_model_name}")


# NOTE(review): this refits the winner on the same training split the grid
# search already used; consider fitting on X_Train / Y_Train (all labelled
# rows) before predicting the test set.
best_model.fit(X_train, Y_train)


test_predictions = best_model.predict(X_Test)

In [None]:
# Pair each test Id with its predicted SalePrice and write the submission file.
submission_df = test[['Id']].assign(SalePrice=test_predictions)

submission_df.to_csv('submission.csv', index=False)