In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#load required libraries
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Load Data

In [None]:
# Training split, read from train.csv relative to the working directory; preview the first rows
df_train=pd.read_csv('train.csv')
df_train.head()

In [None]:
# Test split, read from test.csv relative to the working directory; preview the first rows
df_test=pd.read_csv('test.csv')
df_test.head()

In [None]:
df_train.shape, df_test.shape

**Train** data has 1460 rows and 81 columns AND **Test** data has 1459 rows and 80 columns

# Data Cleaning

In [None]:
def get_cols_with_missing_values(DataFrame):
    """Count missing values per column, reporting only the columns that have any.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name, holding the number of missing entries;
        columns without missing values are left out.
    """
    null_counts = DataFrame.isna().sum()
    has_missing = null_counts.gt(0)
    return null_counts.loc[has_missing]

In [None]:
# Per-column count of missing values in the training data
# (only columns with at least one missing value are listed)
feature_with_na = get_cols_with_missing_values(df_train)
print(feature_with_na)

**Observation**: Target feature 'SalePrice' has no missing value

In [None]:
# Per-column count of missing values in the test data.
# Note: this rebinds `feature_with_na`, replacing the training-data result from the previous cell.
feature_with_na = get_cols_with_missing_values(df_test)
print(feature_with_na)

In [None]:
# Distribution of the target feature.
# `sns.distplot` is deprecated in seaborn; `sns.histplot` is its histogram replacement.
# Plain indexing is used instead of `.get(...)` so a missing column raises a KeyError
# instead of silently handing `None` to the plotting call.
sns.histplot(df_train["SalePrice"], kde=False)
plt.show()

**Observation**: The dependent feature 'SalePrice' is right skewed

In [None]:
# Box plot of every numeric training column (raw values, before missing values are handled)
# to eyeball outliers. `Id` is only a row identifier, so it is left out.
numerical_df = df_train.select_dtypes(exclude=['object']).drop(columns=["Id"])

# The theme is global state, so it only needs to be set once rather than on every iteration.
sns.set_theme(style="whitegrid")

for column in numerical_df:
    fig, ax = plt.subplots(figsize=(16, 4))
    # Pass the values explicitly as `x=`: a bare positional Series is interpreted as `x`
    # by older seaborn releases and as `data` by newer ones, which changes the orientation.
    sns.boxplot(x=numerical_df[column], ax=ax)
    ax.set_title(column)
    plt.show()

In [None]:
# Columns that exist only in the training data (the target 'SalePrice' is expected there, so skip it)
test_columns = set(df_test.columns)
feature_train_not_test = [
    name for name in df_train.columns
    if name != 'SalePrice' and name not in test_columns
]
print(feature_train_not_test)

In [None]:
# Columns that exist only in the test data
train_columns = set(df_train.columns)
feature_test_not_train = [name for name in df_test.columns if name not in train_columns]
print(feature_test_not_train)

**Observation**: Train and Test data have the exact same features (not considering the target feature 'SalePrice')

In [None]:
# Stack test and train into one frame so cleaning and encoding are applied identically to both.
# The added `ind` column records which split each row came from, so the two parts can be
# separated again later. Test rows come first.
# pd.concat is called without ignore_index, so each half keeps its own 0-based row labels
# and the combined index therefore contains repeated labels.
df_merge=pd.concat([df_test.assign(ind="test"), df_train.assign(ind="train")])
df_merge.head()

In [None]:
df_merge.info()

## Impute missing Categorical features

In [None]:
# Collect the object-typed columns whose name contains 'Qual', 'Cond', 'Qu' or 'QC'.
object_columns = [col for col in df_merge.columns if df_merge[col].dtypes == 'object']
# These names contain 'Cond' but are deliberately excluded from the rating features.
excluded_cond_columns = ['Condition1', 'Condition2', 'SaleCondition']

feature_rating_Qual = [col for col in object_columns if 'Qual' in col]
feature_rating_Cond = [col for col in object_columns if 'Cond' in col and col not in excluded_cond_columns]
# 'Qu' also matches every 'Qual' column, so those are filtered out to avoid duplicates.
feature_rating_Qu = [col for col in object_columns if 'Qu' in col and col not in feature_rating_Qual]
feature_rating_QC = [col for col in object_columns if 'QC' in col]

cat_feature_with_rating = [
    *feature_rating_Qual,
    *feature_rating_Cond,
    *feature_rating_Qu,
    *feature_rating_QC,
]

for x in cat_feature_with_rating:
    print(x)

In [None]:
# Categorical features for which NA is treated as a valid value rather than as missing data.
# These are filled with the 'Missing' keyword further down instead of with the column mode.
cat_feature_with_legit_na = ['Alley', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'Fence', 'MiscFeature']
df_merge[cat_feature_with_legit_na].head()

In [None]:
# All categorical features whose missing entries will be replaced with 'Missing':
# the rating-style features plus the features where NA is a legitimate value.
# NOTE(review): despite the name, not every column here is ordinal — a later cell excludes
# 'Alley', 'MiscFeature' and 'GarageType' from ordinal encoding for that reason.
ordinal_cat_features = cat_feature_with_rating + cat_feature_with_legit_na
df_merge[ordinal_cat_features].head()

In [None]:
# According to the data description, a missing entry is a valid value for these categorical
# features, so it is kept as its own category under the 'Missing' keyword.
filled_ordinal_block = df_merge[ordinal_cat_features].fillna("Missing")
df_merge[ordinal_cat_features] = filled_ordinal_block

# Sanity check: nothing should be reported as missing any more
remaining_missing = get_cols_with_missing_values(df_merge[ordinal_cat_features])
print(remaining_missing)

In [None]:
# Split the column names by dtype: object columns are categorical, everything else numerical.
is_object_column = (df_merge.dtypes == 'object').to_numpy()

# 'ind' is only the train/test marker, not a feature
categorical_cols = [cname for cname in df_merge.columns[is_object_column] if cname != 'ind']
# Categorical columns not already handled through the 'Missing' keyword
remaining_cat_cols = [cname for cname in categorical_cols if cname not in ordinal_cat_features]

# The target is excluded so that it is never imputed
numerical_cols = [cname for cname in df_merge.columns[~is_object_column] if cname != 'SalePrice']

In [None]:
df_merge[remaining_cat_cols].head()

In [None]:
# Remaining categorical features: replace missing entries with the most frequent value of the column
for col in remaining_cat_cols:
    most_frequent_value = df_merge[col].mode().iloc[0]
    df_merge[col] = df_merge[col].fillna(most_frequent_value)

# Sanity check: nothing should be reported as missing any more
remaining_missing = get_cols_with_missing_values(df_merge[remaining_cat_cols])
print(remaining_missing)

## Impute missing Numerical features

In [None]:
# Numerical features: replace missing entries with the column mean
# (the means are computed over the combined train + test frame)
column_means = df_merge[numerical_cols].mean()
df_merge[numerical_cols] = df_merge[numerical_cols].fillna(column_means)

# Sanity check: nothing should be reported as missing any more
remaining_missing = get_cols_with_missing_values(df_merge[numerical_cols])
print(remaining_missing)

In [None]:
# Keep every non-object column plus the object columns with fewer than 10 distinct values.
# (The 'ind' marker is an object column with two values, so it is retained as well.)
categorical_cols = []
numerical_cols = []
for cname in df_merge.columns:
    if df_merge[cname].dtypes == 'object':
        if df_merge[cname].nunique() < 10:
            categorical_cols.append(cname)
    else:
        numerical_cols.append(cname)

# Work on an independent copy restricted to the selected columns
my_cols = numerical_cols + categorical_cols
df_merge_clean = df_merge[my_cols].copy()

In [None]:
print(get_cols_with_missing_values(df_merge_clean))

In [None]:
df_merge_clean.head()

# Feature Engineering

In [None]:
# 'Id' is a row identifier, not a predictive feature
df_merge_clean = df_merge_clean.drop(columns='Id')

In [None]:
# Turn the year columns into "age at time of sale" features.
# GarageYrBlt was mean-imputed earlier, so it is cast back to whole years first.
df_merge_clean['GarageYrBlt'] = df_merge_clean['GarageYrBlt'].astype('int')

for year_col in ['GarageYrBlt', 'YearBuilt', 'YearRemodAdd']:
    age_at_sale = df_merge_clean['YrSold'] - df_merge_clean[year_col]
    # A row whose build/remodel/garage year is later than its sale year gives a negative age.
    # The log1p transformation applied later maps -1 to -inf and anything below -1 to NaN
    # (presumably the source of the infinite 'YearBuilt' values patched further down),
    # so negative ages are clipped to 0.
    df_merge_clean[year_col] = age_at_sale.clip(lower=0)

# The sale date itself is no longer needed once the ages are derived
df_merge_clean = df_merge_clean.drop(columns=["YrSold", "MoSold"])

In [None]:
# TotalBsmtSF (total basement area) = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF, so it is redundant
# with its components and is dropped. The two finished-area columns are merged into one.
finished_basement_area = df_merge_clean['BsmtFinSF1'].add(df_merge_clean['BsmtFinSF2'])

df_merge_clean = (
    df_merge_clean
    .drop(columns=["TotalBsmtSF"])
    .assign(BsmtFinSF=finished_basement_area)
    .drop(columns=["BsmtFinSF1", "BsmtFinSF2"])
)

In [None]:
# Replace the per-floor areas with a single total floor area feature
df_merge_clean = (
    df_merge_clean
    .assign(TotalFlrSF=lambda frame: frame['1stFlrSF'] + frame['2ndFlrSF'])
    .drop(columns=["1stFlrSF", "2ndFlrSF"])
)

In [None]:
# Total number of bathrooms: half baths (above grade and basement) count as 0.5 each
total_bath = (
    df_merge_clean['FullBath']
    .add(df_merge_clean['HalfBath'].mul(0.5))
    .add(df_merge_clean['BsmtFullBath'])
    .add(df_merge_clean['BsmtHalfBath'].mul(0.5))
)

df_merge_clean = (
    df_merge_clean
    .assign(Total_Bath=total_bath)
    .drop(columns=["FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath"])
)

# Feature Transformation

In [None]:
# Measure the skewness of every numerical feature to decide which ones get a log transformation
import scipy.stats

# The target is excluded: it is transformed separately
numerical_cols = [
    cname for cname in df_merge_clean.columns
    if cname != 'SalePrice' and df_merge_clean[cname].dtypes != 'object'
]

skew_values = [scipy.stats.skew(df_merge_clean[feature]) for feature in numerical_cols]
skew_df = pd.DataFrame({'Feature': numerical_cols, 'Skew': skew_values})
skew_df['Absolute Skew'] = skew_df['Skew'].abs()
# A feature counts as skewed when its absolute skewness is at least 0.5
skew_df['Skewed'] = skew_df['Absolute Skew'] >= 0.5
skew_df

In [None]:
df_merge_clean[numerical_cols].describe()

**Observation**: The minimum value of several numerical features is 0, so a plain log transformation cannot be applied (log(0) is undefined and evaluates to negative infinity). We will therefore apply the **log1p** transformation, i.e. log(1 + x).

In [None]:
# Apply log(1 + x) to every feature flagged as skewed
skewed_features = skew_df.loc[skew_df['Skewed'].eq(True), 'Feature'].tolist()
for column in skewed_features:
    df_merge_clean[column] = np.log1p(df_merge_clean[column])

# Encode Categorical features

In data description, the values given for 'OverallQual' feature are as follows- 10:Very Excellent ,9:Excellent ,8:Very Good ,7:Good ,6:Above Average ,5:Average ,4:Below Average ,3:Fair ,2:Poor ,1:Very Poor

So we will try to convert categorical features having qualities as per the above example

In [None]:
#Before encoding - features with rating
df_merge_clean[cat_feature_with_rating]

In [None]:
# Ordinal scale shared by all rating features. A column without any 'Missing' entry simply
# never uses that key, so one mapping serves both cases.
rating_scale = {"Missing":0,"Po":1,"Fa":2,"TA":3,"Gd":4,"Ex":5}

for col in cat_feature_with_rating:
    df_merge_clean[col] = df_merge_clean[col].map(rating_scale)

In [None]:
#After encoding - features with rating
df_merge_clean[cat_feature_with_rating]

In [None]:
#features with legit na
df_merge_clean[cat_feature_with_legit_na]

In [None]:
# Ordinal encodings for the "NA is valid" features.
# 'Alley', 'MiscFeature' and 'GarageType' are left out because they are not ordinal.
basement_finish_levels = {"Missing":0,"Unf":1,"LwQ":2,"Rec":3,"BLQ":4,"ALQ":5,"GLQ":6}
legit_na_level_maps = {
    'BsmtExposure': {"Missing":0,"No":1,"Mn":2,"Av":3,"Gd":4},
    'BsmtFinType1': basement_finish_levels,
    'BsmtFinType2': basement_finish_levels,
    'GarageFinish': {"Missing":0,"Unf":1,"RFn":2,"Fin":3},
    'Fence': {"Missing":0,"MnWw":1,"GdWo":2,"MnPrv":3,"GdPrv":4},
}

for col, levels in legit_na_level_maps.items():
    # The cast to int fails if a value is not covered by the mapping
    df_merge_clean[col] = df_merge_clean[col].map(levels).astype('int')

In [None]:
# A few more features whose categories have a natural rank
ranked_level_maps = {
    'LotShape': {"IR3":1,"IR2":2,"IR1":3,"Reg":4},
    'LandContour': {"Low":1,"Bnk":2,"HLS":3,"Lvl":4},
    'Utilities': {"ELO":1,"NoSeWa":2,"NoSewr":3,"AllPub":4},
    'LandSlope': {"Sev":1,"Mod":2,"Gtl":3},
    'CentralAir': {"N":0,"Y":1},
    'PavedDrive': {"N":0,"P":1,"Y":2},
}

for col, levels in ranked_level_maps.items():
    # The cast to int fails if a value is not covered by the mapping
    df_merge_clean[col] = df_merge_clean[col].map(levels).astype('int')

In [None]:
# Object columns still waiting for an encoding (the 'ind' train/test marker is not a feature)
cat_remaining_to_encode = [
    col for col in df_merge_clean.columns
    if col != 'ind' and df_merge_clean[col].dtypes == 'object'
]

print(cat_remaining_to_encode)

In [None]:
# One-hot encode the remaining nominal features; the first level of each is dropped
df_merge_clean_dummies = pd.get_dummies(df_merge_clean[cat_remaining_to_encode], drop_first=True)

# Swap the original categorical columns for their dummy columns
df_without_nominals = df_merge_clean.drop(columns=cat_remaining_to_encode)
df_merge_clean = pd.concat([df_without_nominals, df_merge_clean_dummies], axis=1)

In [None]:
# Infinity can only occur in numeric columns, so restrict the check to those
numeric_columns = df_merge_clean.select_dtypes(include=[np.number]).columns

columns_with_infinite_values = [
    column for column in numeric_columns
    if np.isinf(df_merge_clean[column]).any()
]

# Display columns that have infinite values
print("Columns with infinite values:", columns_with_infinite_values)

In [None]:
print(df_merge_clean['YearBuilt'].unique())

In [None]:
# Both signs of infinity are treated as invalid values
infinite_values = [np.inf, -np.inf]

# Mean of the finite entries only (infinities become NaN, which mean() skips)
mean_year_built = df_merge_clean['YearBuilt'].replace(to_replace=infinite_values, value=np.nan).mean()

# Substitute that mean for every infinite entry
df_merge_clean['YearBuilt'] = df_merge_clean['YearBuilt'].replace(to_replace=infinite_values, value=mean_year_built)

In [None]:
df_merge_clean.columns

In [None]:
# Everything except the 'ind' train/test marker takes part in the correlation analysis
columns_to_include = [col for col in df_merge_clean if col != 'ind']

# Pairwise correlation matrix (reused by the following cells)
corr_matrix = df_merge_clean[columns_to_include].corr()

plt.figure(figsize=(30,30))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Heatmap of Numeric Features')
# Fixed: the original `plt.show` only referenced the function and never called it.
plt.show()

In [None]:
# Uses `corr_matrix`, the correlation matrix computed from df_merge_clean in the previous cell.

# Correlation of every feature with SalePrice (the target's self-correlation is removed)
saleprice_correlations = corr_matrix['SalePrice'].drop(labels='SalePrice')

# Rank by strength of the relationship, regardless of sign
sorted_correlations = saleprice_correlations.abs().sort_values(ascending=False)

# Keep the features whose absolute correlation reaches the 0.5 threshold
is_high = sorted_correlations >= 0.5
high_correlations = sorted_correlations[is_high]

print("Features highly correlated with SalePrice:\n", high_correlations)

In [None]:
# Columns whose absolute correlation with SalePrice exceeds 0.5.
# SalePrice itself is part of the selection (its self-correlation is 1).
saleprice_corr_strength = corr_matrix['SalePrice'].abs()
high_corr_features = corr_matrix.index[saleprice_corr_strength > 0.5]

df_high_corr = df_merge_clean[high_corr_features]

df_high_corr.head()

# Feature Scaling 

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Scale every feature column; the target and the train/test marker are left untouched
features_to_be_scaled = [col for col in df_merge_clean.columns if col not in ['SalePrice', 'ind']]

# Fit the scaler on the training rows only, so that the mean and variance of the test rows
# do not leak into the preprocessing. The fitted scaler is then applied to all rows.
# A NumPy mask is used because the row labels of the combined frame are not unique.
is_train_row = df_merge_clean['ind'].eq('train').to_numpy()
scaler.fit(df_merge_clean.loc[is_train_row, features_to_be_scaled])

df_merge_clean[features_to_be_scaled] = scaler.transform(df_merge_clean[features_to_be_scaled])

# Split Train and Test data

In [None]:
# Separate the combined frame back into its two splits using the 'ind' marker
is_test_row = df_merge_clean["ind"].eq("test")
is_train_row = df_merge_clean["ind"].eq("train")

# Test rows carry no target, so both helper columns are removed; train keeps SalePrice for now
test = df_merge_clean[is_test_row].drop(columns=["SalePrice", "ind"])
train = df_merge_clean[is_train_row].drop(columns=["ind"])

In [None]:
# Take the target out of the training features and model it on the log scale
log_target = np.log(train.pop('SalePrice'))

In [None]:
train.shape, test.shape

In [None]:
# Feature matrix: the training rows without the target.
# `errors='ignore'` keeps the cell working whether or not SalePrice was already removed.
X = train.drop(columns='SalePrice', errors='ignore')

# Target on the log scale (the earlier dead assignment of the raw SalePrice was removed)
y = np.log(df_train['SalePrice'])

# X and y are paired by position, so they must describe the same number of rows
assert len(X) == len(y), "feature matrix and target have different lengths"

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)

# summarize
print('Train', X_train.shape, y_train.shape)
print('Validation', X_valid.shape, y_valid.shape)

# Build Model

In [None]:
# Disabled experiment: the XGBoost RandomizedSearchCV code below is wrapped in a string
# literal, so none of it runs; the cell only evaluates to that string.
'''from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# A parameter grid for XGBoost
params = {
    'n_estimators':[100,200,400,600,800,900,1000],
    'learning_rate':[0.05,0.1,0.15],
    'min_child_weight':[i for i in range(1,5)], 
    'gamma':[0.1*i for i in range(1,10)],  
    'subsample':[0.1*i for i in range(6,11)],
    'colsample_bytree':[0.1*i for i in range(6,11)], 
    'max_depth': [2,3,4,6,7],
    'objective': ['reg:squarederror', 'reg:tweedie'],
    'booster': ['gbtree', 'gblinear'],
    'eval_metric': ['rmse'],
}

regressor = XGBRegressor()
#RandomizedSearchCV
random_search = RandomizedSearchCV(regressor, param_distributions=params,n_iter=5, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')

random_search.fit(X_train, y_train)'''

In [None]:
# Disabled together with the random search above: this is a string literal, nothing runs.
'''best_regressor = random_search.best_estimator_
print(best_regressor)'''

## Boosting Ensemble (CatBoost + XGBoost average)

In [None]:
#from xgboost import XGBRegressor
import catboost as cb
import optuna
from sklearn.metrics import mean_squared_error

In [None]:
def objective(trial):
    """Optuna objective: train one CatBoost model and score it on the validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the learning rate, depth, subsample, colsample_bylevel and
        min_data_in_leaf; the number of iterations is fixed at 1000.

    Returns
    -------
    float
        RMSE between `y_valid` and the predictions for `X_valid` (lower is better).
    """
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = cb.CatBoostRegressor(**params, silent=True)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    # Take the square root explicitly: the `squared=False` argument of mean_squared_error
    # is deprecated and no longer accepted by recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_valid, predictions)))
    return rmse

In [None]:
# Run the Optuna search: 30 trials, minimising the validation RMSE returned by `objective`.
# NOTE(review): no sampler seed is passed, so presumably the trials differ between runs — confirm.
# The CatBoost parameters used for the final models below are hard-coded, not taken from this study.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

In [None]:
# Hyper-parameters of the two final models.
# NOTE(review): no validation set is passed when these models are fitted below, so CatBoost's
# `early_stopping_rounds` is presumably inert — confirm, or pass an eval_set to fit().
catboost_params = {
    'iterations': 5000,
    'learning_rate': 0.02,
    'depth': 4,
    'eval_metric':'RMSE',
    'early_stopping_rounds': 20
}

# `early_stopping_rounds` was removed from the XGBoost parameters: XGBoost requires at least
# one validation dataset for early stopping, and neither the plain fit nor the cross-validation
# below supplies one, so keeping it makes fitting raise an error.
xgboost_params = {
    'n_estimators': 5000,
    'learning_rate': 0.02,
    'colsample_bytree': 0.5,
    'subsample': 0.5,
    'min_child_weight': 2,
}

In [None]:
# The XGBoost import elsewhere in the notebook is commented out, so it is imported here;
# without it `XGBRegressor` is an undefined name on a fresh kernel.
from xgboost import XGBRegressor

# The two regressors that are trained on the full training data and averaged later.
models = {
    # CatBoost is imported as `cb`, so the class has to be accessed through that alias.
    "catboost": cb.CatBoostRegressor(**catboost_params, verbose=0),
    # XGBoost's logging parameter is called `verbosity` (it has no `verbose` parameter).
    "xgb": XGBRegressor(**xgboost_params, verbosity=0),
}

In [None]:
# Fit every model on the complete training set against the log-transformed target
for name, model in models.items():
    model.fit(train, log_target)
    print(f"{name} trained.")

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold cross-validation of each model on the training data
kf = KFold(n_splits=10)
results = {}

for name, model in models.items():
    negative_mse = cross_val_score(model, train, log_target, scoring='neg_mean_squared_error', cv=kf)
    # The target is log(SalePrice), so sqrt(-score) is an RMSE on the log scale;
    # the value stored per fold is the exponential of that RMSE.
    result = np.exp(np.sqrt(-negative_mse))
    results[name] = result

In [None]:
# Mean and standard deviation of the per-fold scores for each model
for name, result in results.items():
    fold_mean = np.mean(result)
    fold_std = np.std(result)
    print("----------\n" + name)
    print(fold_mean)
    print(fold_std)

In [None]:
from sklearn.metrics import mean_absolute_error

# Each model predicts log(SalePrice); convert back to prices before blending
catboost_prices = np.exp(models['catboost'].predict(test))
xgb_prices = np.exp(models['xgb'].predict(test))

# Equal-weight average of the two models
final_predictions = 0.5 * catboost_prices + 0.5 * xgb_prices

In [None]:
# Disabled evaluation snippet: wrapped in a string literal, so nothing here runs.
'''# Calculate MAE
from sklearn.metrics import mean_absolute_error

y_pred = model.predict(X_valid)
rmse_pred = mean_absolute_error(y_valid, y_pred) 
print("Root Mean Absolute Error:" , np.sqrt(rmse_pred))'''

In [None]:
# Disabled single-model prediction snippet: wrapped in a string literal, so nothing here runs.
# `final_predictions` comes from the blended models in an earlier cell instead.
'''# Get test predictions
preds_test = model.predict(test)

final_predictions = np.exp(preds_test)'''

In [None]:
# Save test predictions to file.
# The Ids are taken from the original test data instead of being rebuilt as "row index + 1461",
# which only works while the test Ids happen to start at 1461 and the row index is untouched.
# `test` keeps the row order of `df_test`, so the predictions line up with these Ids by position.
output = pd.DataFrame({'Id': df_test['Id'].to_numpy(),
                       'SalePrice': final_predictions})
output.to_csv('submission.csv', index=False)