<a href="https://www.kaggle.com/code/youssefatourqui/predict-house-prices-in-ames-and-lowa?scriptVersionId=261803303" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition train/test CSV files, using the "Id" column as the row index.
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv', index_col="Id")
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv', index_col="Id")

# Show both shapes as the cell output.
df_train.shape  , df_test.shape

In [None]:
# Tag every row with its origin so the combined frame can be split back later.
df_train['test'] = 0
df_test['test'] = 1

# Stack train on top of test without sorting the columns.
df = pd.concat([df_train, df_test], sort=False)
df.head()

# Check the correlations between the numeric features and choose the features that are highly correlated with the target (SalePrice)

In [None]:
def visualize_numeric_correlations(df):
    """Draw an annotated heatmap of the pairwise correlations of numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated against each other.

    Raises
    ------
    ValueError
        If selecting the numeric columns of ``df`` yields an empty frame.
    """
    numeric_only = df.select_dtypes(include='number')
    if numeric_only.empty:
        raise ValueError("No numeric features found in the DataFrame.")

    corr = numeric_only.corr()

    plt.figure(figsize=(25, 15))
    heatmap_options = dict(
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        square=True,
        cbar_kws={"shrink": .8},
    )
    sns.heatmap(corr, **heatmap_options)
    plt.title('Correlation Matrix of Numeric Features', fontsize=16)
    plt.tight_layout()
    plt.show()

In [None]:
visualize_numeric_correlations(df_train)


**The features most strongly correlated with SalePrice are:**
- OverallQual
- GrLivArea
- GarageCars
- GarageArea
- TotalBsmtSF
- 1stFlrSF
- FullBath
- TotRmsAbvGrd
- YearBuilt
- YearRemodAdd
  
**The features that are highly correlated with each other are:**
- TotalBsmtSF * 1stFlrSF
- GrLivArea * TotRmsAbvGrd
- GarageYrBlt * YearBuilt
- GarageCars * GarageArea


# investigate the relation of most important numeric features with saleprice

In [None]:
features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
]
num_features = len(features)
num_rows = (num_features + 2) // 3  # ceiling division: three plots per row
fig, axes = plt.subplots(nrows=num_rows, ncols=3, figsize=(18, 5 * num_rows))

# Work with a flat sequence of axes so each feature pairs with one subplot.
axes = axes.flatten()

# One SalePrice-vs-feature scatter plot per subplot.
for ax, feature in zip(axes, features):
    sns.scatterplot(x=df[feature], y=df['SalePrice'], ax=ax)
    ax.set_title(f'SalePrice vs {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('SalePrice')
    ax.grid(True)

# Remove the subplots left over once every feature has been drawn.
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Drop some extreme values in the features that are strongly correlated with SalePrice

In [None]:
df_train[df_train.GrLivArea > 4500]

In [None]:
df_train[df_train.TotalBsmtSF > 6000]

In [None]:
df_train[df_train['1stFlrSF'] > 4000]

In [None]:
# Ids of the extreme training rows inspected in the cells above.
outlier_ids = [524, 1299]
df_train = df_train.drop(index=outlier_ids)

# `df` (train + test) was built before this cell and the final training set is
# re-derived from it later, so the same training rows must be removed there too;
# otherwise the outliers silently come back at modelling time.  The `test == 0`
# condition guarantees that no test row is ever removed.
df = df[~((df['test'] == 0) & df.index.isin(outlier_ids))].copy()

df_train.shape

# Now we will deal with missing values in the train and test datasets

In [None]:
df_train.isnull().sum().sort_values(ascending = False).head(22)

In [None]:
df['PoolQC'] = df['PoolQC'].fillna('No')

In [None]:
df['MiscFeature'] = df['MiscFeature'].fillna('NoMisc')

In [None]:
df.Alley = df.Alley.fillna('NoAlley')

In [None]:
df.Fence = df.Fence.fillna('No')

In [None]:
df.MasVnrType = df.MasVnrType.fillna('NoMasonry')

In [None]:
df.FireplaceQu = df.FireplaceQu.fillna('No')

In [None]:
df_train[df_train.LotFrontage == 0]

We can assume that NaN values in LotFrontage mean 0, because the feature never takes the value 0.

In [None]:
df.LotFrontage = df.LotFrontage.fillna(0)

In [None]:
df.GarageYrBlt = df.GarageYrBlt.fillna('NO')

In [None]:
df.GarageType = df.GarageType.fillna('NoGarage')

In [None]:
df.GarageFinish = df.GarageFinish.fillna('NOG')

In [None]:
df.GarageCond = df.GarageCond.fillna('No')

In [None]:
df.GarageQual = df.GarageQual.fillna('No')

In [None]:
# Replace missing basement descriptors with explicit placeholder labels.
basement_fill_values = {
    'BsmtFinType2': 'No',
    'BsmtExposure': 'NoB',
    'BsmtCond': 'No',
    'BsmtQual': 'No',
    'BsmtFinType1': 'No',
}
for bsmt_column, placeholder in basement_fill_values.items():
    df[bsmt_column] = df[bsmt_column].fillna(placeholder)

In [None]:
df.MasVnrArea = df.MasVnrArea.fillna(0)

In [None]:
# Remove training rows that have no `Electrical` value.  The filter is limited
# to rows flagged test == 0 so that a missing value can never remove a test
# row, which has to stay in the frame to receive a prediction.
missing_electrical_train = df['Electrical'].isna() & (df['test'] == 0)
df = df[~missing_electrical_train].copy()

In [None]:
df.isna().sum().sort_values(ascending = False).head(20)

In [None]:
df.MSZoning.value_counts()

In [None]:
df.MSZoning = df.MSZoning.fillna('RL')

In [None]:
df[['GarageCars' ,'GarageArea','GarageType','GarageFinish','GarageQual']][df.GarageCars.isna() == True]

In [None]:
df.GarageCars = df.GarageCars.fillna(0)
df.GarageArea = df.GarageArea.fillna(0)

In [None]:
df[['BsmtFullBath' , 'BsmtHalfBath' , 'BsmtFinType1' , 'BsmtFinType2']][df.BsmtFullBath.isna() == True]

In [None]:
df.BsmtFullBath = df.BsmtFullBath.fillna(0)
df.BsmtHalfBath = df.BsmtHalfBath.fillna(0)

In [None]:
df[['Utilities' , 'Functional']][df.Utilities.isna() == True]

In [None]:
df.Functional.value_counts()

In [None]:
df.Functional = df.Functional.fillna('Typ')

In [None]:
df.Utilities.value_counts()

In [None]:
df.Utilities = df.Utilities.fillna('AllPub')

In [None]:
df[['TotalBsmtSF' , 'BsmtUnfSF' ,'BsmtFinSF2', 'BsmtFinSF1', 'BsmtFinType1' , 'BsmtFinType2']][df.TotalBsmtSF.isna() == True]

In [None]:
# Missing basement surface areas are treated as 0.
for bsmt_area_column in ['TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']:
    df[bsmt_area_column] = df[bsmt_area_column].fillna(0)

In [None]:
df[['KitchenQual' ,'KitchenAbvGr' ]][df.KitchenQual.isna() == True]

In [None]:
df.KitchenQual.value_counts()

In [None]:
df.KitchenQual = df.KitchenQual.fillna('TA')

In [None]:
df[['Exterior1st','Exterior2nd','ExterQual']][df.Exterior2nd.isna()==True]

In [None]:
df[['Exterior1st','Exterior2nd']][df.ExterQual == 'TA'].value_counts()

In [None]:
df.Exterior1st = df.Exterior1st.fillna('MetalSd')
df.Exterior2nd = df.Exterior2nd.fillna('MetalSd')

In [None]:
df.SaleType.value_counts()

In [None]:
df.SaleType = df.SaleType.fillna('WD')

All of the remaining NaN values are in the test dataset, so we can use algorithms that handle missing values internally.

# deal with ordinal categorical features

In [None]:
# Five-level quality scale: Excellent, Good, Average/Typical, Fair, Poor.
label_mapping_5lvl = dict(Ex=5, Gd=4, TA=3, Fa=2, Po=1)

five_level_columns = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual']
for column in five_level_columns:
    df[column] = df[column].map(label_mapping_5lvl)



In [None]:
# Quality scale used for the basement, fireplace and garage ratings.
# 'No' is the placeholder written by the earlier fillna cells for a missing value.
custom_map = {
    'No': 0,
    'Po': 1,  # Poor
    'Fa': 2,  # Fair
    'TA': 3,  # Typical
    'Gd': 4,  # Good
    'Ex': 5,  # Excellent
}
rated_columns = ['BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond']
for column in rated_columns:
    df[column] = df[column].map(custom_map)


In [None]:
# Basement finish type, ordered from no basement (0) to best finish (6).
custom_map2 = {
    'No': 0,   # No Basement
    'Unf': 1,  # Unfinished
    'LwQ': 2,  # Low Quality
    'Rec': 3,  # Average Rec Room
    'BLQ': 4,  # Below Average Living Quarters
    'ALQ': 5,  # Average Living Quarters
    'GLQ': 6,  # Good Living Quarters
}
finish_type_columns = ['BsmtFinType1', 'BsmtFinType2']
for column in finish_type_columns:
    df[column] = df[column].map(custom_map2)

In [None]:
# Basement exposure, ordered from no basement (0) to good exposure (4).
bsmt_exposure_levels = {
    'NoB': 0,  # No Basement
    'No': 1,   # No Exposure
    'Mn': 2,   # Minimum Exposure
    'Av': 3,   # Average Exposure (split levels or foyers typically score average or above)
    'Gd': 4,   # Good Exposure
}
df['BsmtExposure'] = df['BsmtExposure'].map(bsmt_exposure_levels)

In [None]:
# Home functionality, ordered from salvage only (1) to typical (8).
functional_levels = {
    'Sal': 1,   # Salvage only
    'Sev': 2,   # Severely Damaged
    'Maj2': 3,  # Major Deductions 2
    'Maj1': 4,  # Major Deductions 1
    'Mod': 5,   # Moderate Deductions
    'Min2': 6,  # Minor Deductions 2
    'Min1': 7,  # Minor Deductions 1
    'Typ': 8,   # Typical Functionality
}
df['Functional'] = df['Functional'].map(functional_levels)

In [None]:
# Pool quality, ordered from no pool (0) to excellent (4).
pool_quality_levels = {
    'No': 0,  # No Pool
    'Fa': 1,  # Fair
    'TA': 2,  # Average/Typical
    'Gd': 3,  # Good
    'Ex': 4,  # Excellent
}
df['PoolQC'] = df['PoolQC'].map(pool_quality_levels)

In [None]:
# Fence quality, ordered from no fence (0) to good privacy (4).
fence_levels = {
    'No': 0,     # No Fence
    'MnWw': 1,   # Minimum Wood/Wire
    'GdWo': 2,   # Good Wood
    'MnPrv': 3,  # Minimum Privacy
    'GdPrv': 4,  # Good Privacy
}
df['Fence'] = df['Fence'].map(fence_levels)

# deal with features that have high correlation 

For each pair of highly correlated features, we can drop the one that is less correlated with SalePrice or combine them into one feature

or we can do PCA 

TotalBsmtSF * 1stFlrSF

GrLivArea * TotRmsAbvGrd

GarageYrBlt * YearBuilt

GarageCars * GarageArea

-first we will drop 1 feature

-after that we will apply PCA to see if there is an improvement in the score

In [None]:
# Combine the basement and the two floor areas into a single feature.
# The column must be created with item assignment: `df.TotalArea = ...` would
# only set a Python attribute on the DataFrame object and add no column, so the
# area information would be lost once the source columns are dropped below.
df['TotalArea'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df.drop(columns=['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], inplace=True)

In [None]:
# we can keep just GrLivArea that have higher correlation with SalePrice
df.drop(columns = ['TotRmsAbvGrd'] , inplace =True)

In [None]:
# we can keep YearBuilt that have higher correlation and seem more important than GarageYrBlt
df.drop(columns = ['GarageYrBlt'] , inplace =True)

In [None]:
# also here we can keep GarageCars that have higher correlation with SalePrice
df.drop(columns = ['GarageArea'] , inplace =True)

Now it's time to one-hot encode the non-ordinal categorical features.

In [None]:
df.isna().sum()

In [None]:
df.info()

In [None]:
object_columns = df.select_dtypes(include=['object']).columns.tolist()
object_columns

In [None]:
# One-hot encode every remaining object column in a single call.  The encoded
# columns are removed and their indicator columns (named "<column>_<value>")
# are appended after the untouched columns, without re-copying the growing
# frame once per encoded column.
df = pd.get_dummies(df, columns=object_columns)

In [None]:
df.info()

In [None]:
# Split the combined frame back into its two parts using the origin flag,
# then discard the flag (and, for the test part, the target column).
is_test_row = df['test'] == 1
df_train = df[~is_test_row].drop(columns=['test'])
df_test = df[is_test_row].drop(columns=['test', 'SalePrice'])
df_train.shape , df_test.shape

# Model Building

We will train multiple models and choose the one with the lowest MSE;
for that model we will then do hyperparameter tuning.

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score ,GridSearchCV

from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
import optuna

In [None]:
# Separate the target column from the predictors.
target_column = 'SalePrice'
y = df_train[target_column]
X = df_train.drop(target_column, axis=1)

# Hold out 20% of the rows, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyperparameter candidates: the first four entries vary, the rest are fixed.
param_grid = dict(
    max_depth=[4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[1000, 2000, 3000],
    min_child_weight=[1, 2, 3],
    colsample_bytree=[0.7],
    subsample=[0.7],
    reg_alpha=[0.5],
    reg_lambda=[1.0],
    num_parallel_tree=[1],
)

# Base estimator with default settings; the grid supplies the tuned values.
xgb = XGBRegressor()

# Exhaustive search with 5-fold cross-validation, scored by negative MSE.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# The model is fitted on the natural log of the target.
log_sale_price = np.log(y)
grid_search.fit(X, log_sale_price)

In [None]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)




In [None]:
# Predictions come out on the log scale (the model was fitted on log prices),
# so exponentiate them to get back to prices.
log_predictions = grid_search.predict(df_test)
gs_predictions = np.exp(log_predictions)

# Submission file: one row per test Id with its predicted SalePrice.
submission_columns = {'Id': df_test.index, 'SalePrice': gs_predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission_grid_XGBoost.csv', index=False)