In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
kaggle_inputs = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_inputs:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the competition's train and test splits from the Kaggle dataset mount.
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Report (rows, columns) of each split, then preview the first training rows.
for split in (train_data, test_data):
    print(split.shape)
train_data.head()

In [None]:
# Stack the training features on top of the test rows so that cleaning and encoding are
# applied identically to both. ignore_index=True gives the combined frame a unique
# 0..n-1 index; without it the train and test row labels overlap, which makes any
# label-based lookup or alignment on the combined frame ambiguous.
df = pd.concat([train_data.drop(columns='SalePrice'), test_data], ignore_index=True)

# The target is kept aside; it exists only for the training rows.
y = train_data['SalePrice']

### 1) Data Cleaning

In [None]:
# Remove the 'Id' column from the combined feature frame.
df = df.drop(columns=['Id'])

In [None]:
# Cast MSSubClass to strings so it is handled together with the other object-typed
# columns. The dtype is printed before and after the cast.
print(df['MSSubClass'].dtype)
df['MSSubClass'] = df['MSSubClass'].astype(str)
print(df['MSSubClass'].dtype)


### 2) handle Missing values

#### 2.1) categorical Missing Values

In [None]:
# Missing-value count for every object-typed column, largest first.
object_columns = df.select_dtypes('O')
object_columns.isna().sum().sort_values(ascending=False)

In [None]:
# In these features a missing value carries meaning (e.g. the house has no garage),
# so NaN is replaced by the explicit category "None".
miss1 = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
df[miss1] = df[miss1].fillna("None")

In [None]:
# Re-check the object-typed columns: missing-value count per column, largest first.
object_columns_after_fill = df.select_dtypes('O')
object_columns_after_fill.isna().sum().sort_values(ascending=False)

In [None]:
# The remaining categorical gaps are filled with each column's most frequent value.
# NOTE(review): if the CSV reader parsed the literal category "None" in MasVnrType as
# missing, mode-filling relabels those rows with another category — confirm.
miss2 = [
    'MasVnrType', 'MSZoning', 'Functional', 'Utilities', 'SaleType',
    'Exterior2nd', 'Exterior1st', 'Electrical', 'KitchenQual',
]
most_frequent = {column: df[column].mode().iloc[0] for column in miss2}
df[miss2] = df[miss2].fillna(most_frequent)

#### 2.2) Numerical Missing Values

In [None]:
# Numerical missing values are imputed with a K-nearest-neighbours regressor: it is fitted
# on the rows where the target column is present and then predicts the rows where it is
# missing. (A simpler alternative would be to fill with the median or the mode.)
from sklearn.neighbors import KNeighborsRegressor


def _KNN_(data, col):
    """Fill the missing values of one numeric column using KNN regression.

    The predictors are the numeric columns of ``data`` that contain no missing
    values at all. A 5-neighbour regressor is fitted on the rows where ``col``
    is present and used to predict the rows where it is missing.

    Args:
        data: DataFrame to impute. It is modified in place.
        col: Name of the numeric column whose NaN entries are to be predicted.

    Returns:
        The same ``data`` object, with ``col`` filled where it was missing.
    """
    # Work on the frame that was passed in, not on a notebook-level global.
    numeric = data.select_dtypes(exclude='O')
    missing = numeric[col].isna()

    # Nothing to impute: skip fitting, since scikit-learn rejects an empty prediction set.
    if not missing.any():
        return data

    # Predictors: numeric columns without a single NaN.
    is_complete = numeric.notna().all()
    predictors = is_complete[is_complete].index

    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(numeric.loc[~missing, predictors], numeric.loc[~missing, col])

    # A single .loc assignment writes into `data` itself; the chained form
    # data[col][mask] = ... can end up assigning to a temporary copy instead.
    data.loc[missing, col] = knn.predict(numeric.loc[missing, predictors])
    return data


In [None]:
# Missing-value count for every numerical (non-object) column, largest first.
numeric_columns = df.select_dtypes(exclude='O')
numeric_columns.isna().sum().sort_values(ascending=False)

In [None]:
# Numeric columns to impute; each one is filled in turn by _KNN_.
num_f = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea',
]

# _KNN_ fills the frame it receives and returns that same frame, so the result of one
# call can be fed straight into the next.
clean_df = df
for col in num_f:
    clean_df = _KNN_(clean_df, col)

In [None]:
# Total number of missing cells left in the imputed frame.
remaining_missing = clean_df.isna().sum().sum()
remaining_missing

### 3) Feature Engineering

In [None]:
# LotFrontage: linear feet of street connected to the property.
# LotArea: lot size in square feet.
# Combine the two lot measurements into a single feature by adding them.
lot_frontage = clean_df['LotFrontage']
lot_area = clean_df['LotArea']
clean_df['TotalArea'] = lot_frontage.add(lot_area)

In [None]:
# OverallQual: rates the overall material and finish of the house.
# OverallCond: rates the overall condition of the house.
# Their sum is used as one combined quality score.
overall_quality = clean_df['OverallQual']
overall_condition = clean_df['OverallCond']
clean_df['Total_Home_Quality'] = overall_quality.add(overall_condition)

In [None]:
# BsmtFullBath: basement full bathrooms.
# BsmtHalfBath: basement half bathrooms.
# FullBath: full bathrooms above grade.
# HalfBath: half baths above grade.
# Each half bath counts as 0.5 of a bathroom.
above_grade_baths = clean_df['FullBath'] + 0.5 * clean_df['HalfBath']
clean_df['Total_Bathrooms'] = (
    above_grade_baths
    + clean_df['BsmtFullBath']
    + 0.5 * clean_df['BsmtHalfBath']
)

In [None]:
# Above-grade living area plus total basement area.
living_area = clean_df["GrLivArea"]
clean_df["AllSF"] = living_area + clean_df["TotalBsmtSF"]

# Living area divided by the count of rooms, bathrooms and kitchens above grade.
room_count = (
    clean_df["TotRmsAbvGrd"]
    + clean_df["FullBath"]
    + clean_df["HalfBath"]
    + clean_df["KitchenAbvGr"]
)
clean_df["AvgSqFtPerRoom"] = living_area / room_count

# First-floor plus second-floor area.
clean_df["totalFlrSF"] = clean_df["1stFlrSF"] + clean_df["2ndFlrSF"]

In [None]:
# (rows, columns) of the frame after the engineered features were added.
clean_df.shape

### 4) Feature Transformation

#### Handling Skewed Data 
- Many machine learning algorithms assume that the features are normally distributed. If a feature is highly skewed (not symmetrically distributed), it might be beneficial to apply a log transformation.
- The log transformation compresses the range of large values and expands the range of small values. This can be particularly useful when dealing with features that have a wide range of magnitudes.

In [None]:
# Select the numeric columns whose absolute skewness is greater than 0.5.
numerical_data = clean_df.select_dtypes(exclude='O')

skewness = numerical_data.skew()
high_skew_cols = skewness[skewness.abs() > 0.5].index
high_skew_cols

In [None]:
# LotFrontage before the log transform: skewed rather than normally distributed.
# Plot from clean_df — the frame the transform is applied to — instead of relying on
# `df` happening to be the same object.
sns.histplot(clean_df['LotFrontage'], kde=True).set_title("LotFrontage before log transform");

In [None]:
# Apply log1p, i.e. log(1 + x), to every highly skewed column in one step.
# Note: the columns are overwritten, so re-running this cell transforms them again.
clean_df[high_skew_cols] = np.log1p(clean_df[high_skew_cols])

In [None]:
# distibuted data 
sns.histplot(clean_df['LotFrontage'], kde=True).set_title("LotFrontage after log transform");  # from clean_df (the transformed frame), not `df` via aliasing

In [None]:
import matplotlib.pyplot as plt

# Natural log of the target; the models further down are trained on this series.
log_y = np.log(y)

# Side-by-side histograms: raw SalePrice on the left, its log on the right.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

sns.histplot(y, kde=True, ax=ax_raw)
ax_raw.set_title("NO Log Transform")

sns.histplot(log_y, kde=True, ax=ax_log)
ax_log.set_xlabel("Log SalePrice")
ax_log.set_title("Log Transform")

plt.show()

#### One Hot Encoding

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first level of each.
# dtype=int makes get_dummies emit 0/1 integers directly, which replaces the separate
# whole-frame replace({False: 0, True: 1}) pass and its dtype-dependent behaviour.
print(clean_df.shape)
df1 = pd.get_dummies(clean_df, drop_first=True, dtype=int)
print(df1.shape)
df1.head()

#### Standardization
- a common preprocessing step in machine learning to ensure that all features have the same scale, which can be important for some algorithms.
- It standardizes the features so that they have a mean of 0 and a standard deviation of 1.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the encoded frame and transform it in one call, then wrap the
# resulting array back into a DataFrame with the original row and column labels.
# NOTE(review): the scaler is fitted on train and test rows together — confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df1)

scaled_df = pd.DataFrame(scaled_values, index=df1.index, columns=df1.columns)
scaled_df.head()

### 5) Split the train and Test data again

In [None]:
# The combined frame was built as [train rows, test rows], so the first len(train_data)
# rows are the training set and everything after them is the test set. Deriving the
# boundary from the loaded data avoids hard-coded row counts.
n_train = len(train_data)
X_train = scaled_df.iloc[:n_train, :]
X_test = scaled_df.iloc[n_train:, :]

# Guard against the combined frame having gained or lost rows along the way.
assert len(X_test) == len(test_data), "train/test split does not match the loaded files"

## Model Selection and Training

We will set up the **PyCaret environment** for regression, then compare the performance of various regression models to quickly identify the ones that perform well on this dataset.

In [None]:
# Local folder that receives the downloaded package files used for the PyCaret install.
path_to_dir_packages = "./packages"
# -p: create the folder if needed and do not fail when it already exists.
!mkdir -p {path_to_dir_packages}

In [None]:
# Download PyCaret and two build-time helper packages into the local packages folder
# (no installation happens here; the versions are not pinned).
!pip download pycaret -d {path_to_dir_packages}
!pip download poetry-core -d {path_to_dir_packages}
!pip download oldest-supported-numpy -d {path_to_dir_packages}

In [None]:
# Install PyCaret from the local "packages" folder only: --no-index disables the package
# index and --find-links points pip at the previously downloaded files.
!pip install --no-index --find-links=packages pycaret

## Model selection using PyCaret
1)  **Setup()**: This function initializes the PyCaret environment for regression tasks. It automatically performs various preprocessing steps, such as handling missing values, encoding categorical variables, and splitting the data into training and testing sets.

2) **compare_models()**: This function compares the performance of the different regression models available in PyCaret. It trains each model on the training data and evaluates its performance using default metrics. The comparison results, including various metrics and plots, are then displayed.

In [None]:
from pycaret.regression import setup, compare_models

# PyCaret expects features and target in one frame: append the log-transformed
# SalePrice as an extra column next to the scaled training features.
training_frame = pd.concat([X_train, log_y], axis=1)
_ = setup(data=training_frame, target='SalePrice')

In [None]:
# Run PyCaret's model comparison on the experiment configured by setup() above.
compare_models()

It seems that the CatBoost model is the best choice.

In [None]:
from catboost import CatBoostRegressor

# Train CatBoost on the scaled training features against the log-transformed target.
# verbose=0 silences the per-iteration training log, matching how the same model is
# configured in the ensemble section.
cat_boost = CatBoostRegressor(verbose=0)
cat_boost.fit(X_train, log_y);

## Predict and Submit 

In [None]:
# The model was trained on log(SalePrice), so exponentiate to get prices back.
log_price_pred = cat_boost.predict(X_test)
cat_boost_prediction = np.exp(log_price_pred)
cat_boost_prediction

In [None]:
# Two-column submission frame: the test Ids next to the predicted SalePrice.
cat_boost_submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': cat_boost_prediction,
})
cat_boost_submission.head()

In [None]:
# Write the submission with a header row and without the DataFrame index column.
cat_boost_submission.to_csv('cat_boost_submission.csv', index=False)

#### Score = 0.12174

## Ensemble

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# One fixed seed for every model so that re-running the notebook reproduces the same
# fitted ensemble and therefore the same submission.
ENSEMBLE_SEED = 42

# Regressors that make up the ensemble, keyed by the short names used when blending.
models = {
    "catboost": CatBoostRegressor(verbose=0, random_seed=ENSEMBLE_SEED),
    "gbr": GradientBoostingRegressor(random_state=ENSEMBLE_SEED),
    "lightgbm": LGBMRegressor(random_state=ENSEMBLE_SEED),
    "et": ExtraTreesRegressor(random_state=ENSEMBLE_SEED),
    "rf": RandomForestRegressor(random_state=ENSEMBLE_SEED),
    'xgboost': XGBRegressor(random_state=ENSEMBLE_SEED)
}

In [None]:
# Fit every ensemble member on the same training data and log-transformed target,
# reporting each one as it finishes.
for model_name in models:
    models[model_name].fit(X_train, log_y)
    print(f"{model_name} trained.")

#### Predict and submit

In [None]:
# Blend weights per model; each model's log-price prediction is exponentiated back to a
# price before the weighted contributions are added up.
ensemble_weights = {
    'catboost': 0.4,
    'gbr': 0.2,
    'lightgbm': 0.1,
    'et': 0.1,
    'rf': 0.1,
    'xgboost': 0.1,
}
predictions = sum(
    weight * np.exp(models[model_name].predict(X_test))
    for model_name, weight in ensemble_weights.items()
)

In [None]:
# Two-column submission frame: the test Ids next to the blended SalePrice predictions.
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': predictions,
})
submission

In [None]:
# Write the ensemble submission with a header row and without the DataFrame index column.
submission.to_csv('./ensemble_submission3.csv', index=False)

#### Ensemble Score = 0.12359