<a href="https://www.kaggle.com/code/ocanaydin/house-regression-num-categorical-features?scriptVersionId=113933975" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**GET TRAIN AND TEST DATA**

In [None]:
# Read train.csv and test.csv from the Kaggle input directory into DataFrames.
train_csv = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_csv = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Keep the "Id" columns aside; test_Id is reused when the predictions are written out.
train_Id, test_Id = train_csv["Id"], test_csv["Id"]

# The working frames are the raw tables without the "Id" column.
train_x = train_csv.drop(columns="Id")
test_x = test_csv.drop(columns="Id")

**PREPROCESSING DATA**

***Check whether the dataset contains NaN values, then check whether it contains categorical data.***

In [None]:
"""Check na datas."""
def check_na(data):
    """Return the labels of the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    list
        Column labels, in column order, that hold at least one missing entry.
        Empty when nothing is missing (or when ``data`` has no columns).
    """
    # One vectorised pass per column. This does not look rows up by the labels
    # 0..n-1, so it also works when the frame has a non-default index.
    return [key for key, has_na in data.isna().any().items() if has_na]

In [None]:
"""Check categorical features."""
def check_categorical(data):
    """Return the labels of the non-numeric (categorical / text) columns of ``data``.

    A column counts as categorical when its dtype is NumPy ``object``, pandas
    ``category`` or a pandas string dtype, so the result does not depend on how
    text columns happen to be stored.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    list
        Column labels, in column order, of the categorical columns.
    """
    categorical_keys = []
    for key, dtype in data.dtypes.items():
        is_text_like = isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
        if dtype == object or is_text_like:
            categorical_keys.append(key)
    return categorical_keys


**VISUALIZATION OF DATA AS HISTOGRAM**

In [None]:
# Histogram grid of the training columns, 60 bins each, for a quick look at the distributions.
_fig = train_x.hist(
    bins=60,
    figsize=(25, 24),
    color="red",
    edgecolor="gray",
    xlabelsize=10,
    ylabelsize=10,
)

**NUMERICAL FEATURE SELECTION**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix of the numeric columns only. The text columns are excluded
# explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
corr_mat = train_x.select_dtypes(include="number").corr()

# The k columns most positively correlated with SalePrice; SalePrice itself is
# among them (correlation 1 with itself).
k = 9
cols = corr_mat.nlargest(k, "SalePrice")["SalePrice"].index

# Correlations between the selected columns, drawn as an annotated heat map.
cm = np.corrcoef(train_x[cols].values.T)
sns.set(font_scale=1)
plt.subplots(figsize=(20, 12))
heat_map = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt=".2f",
    annot_kws={"size": 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

**KEEP ONLY THE COLUMNS MOST POSITIVELY CORRELATED WITH SalePrice AND DROP THE OTHERS.**

In [None]:
# Training frame restricted to the selected columns (SalePrice included).
train_x_numerical = train_x.loc[:, cols]

In [None]:
"""Seperate SellPrice column from test."""
# The test frame gets the same selected columns minus the target. SalePrice is
# removed by name instead of assuming it is the first entry of cols.
test_x_numerical = test_x[cols.drop("SalePrice")]

In [None]:
"""Detect value if it contains categorical data for train x."""
# Columns of the training frame with missing values, and its categorical columns.
nan_train_keys = check_na(train_x)
categorical_train_keys = check_categorical(train_x)

print(f"Nan keys : {len(nan_train_keys)}\nCategorical keys : {len(categorical_train_keys)}")
print(categorical_train_keys)

In [None]:
"""Detect value if it contains categorical data for test x."""
# Columns of the test frame with missing values, and its categorical columns.
nan_test_keys = check_na(test_x)
categorical_test_keys = check_categorical(test_x)

print(f"Nan keys : {len(nan_test_keys)}\nCategorical keys : {len(categorical_test_keys)}")

**CATEGORICAL FEATURE SELECTION**

In [None]:
# Sub-frame holding only the categorical columns of the training data.
train_categoric = train_x.loc[:, categorical_train_keys]
print(categorical_train_keys)

In [None]:
# One count plot per categorical column, three per row. The grid is sized from
# the number of columns instead of being hard-coded, so no column is skipped
# and no index runs past the end of the column list.
plots_per_row = 3
n_plots = len(train_categoric.columns)
n_rows = max(1, -(-n_plots // plots_per_row))  # ceiling division, at least one row

# squeeze=False always returns a 2-D array of axes, even for a single row.
fig, axes = plt.subplots(n_rows, plots_per_row, figsize=(25, 3 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for count, column in enumerate(train_categoric.columns):
    sns.countplot(x=column, alpha=0.7, data=train_categoric, ax=flat_axes[count])

# Hide the axes of the last row that received no plot.
for unused_ax in flat_axes[n_plots:]:
    unused_ax.set_visible(False)

fig.tight_layout()

In [None]:
# Count plot of the SaleCondition column on its own.
sns.countplot(data=train_categoric, x="SaleCondition", alpha=0.7)

**Now we can select some categorical features that affect SalePrice. (I chose features that do not contain too many labels.)**

In [None]:
# Hand-picked categorical columns that are kept as model features.
categorical_features = [
    "MSZoning",
    "LotShape",
    "LandContour",
    "BldgType",
    "HouseStyle",
    "MasVnrType",
    "ExterQual",
    "BsmtQual",
    "BsmtExposure",
    "HeatingQC",
    "KitchenQual",
    "GarageFinish",
    "PoolQC",
    "SaleCondition",
]
print(f"Len of categorical features : {len(categorical_features)}")

In [None]:
# Restrict both data sets to the hand-picked categorical columns.
test_categoric = test_x.loc[:, categorical_features]
train_categoric = train_categoric.loc[:, categorical_features]
print(train_categoric.shape,test_categoric.shape)

**Concatenate both numeric and categorical features.**

In [None]:
# Put the categorical columns to the right of the numeric ones; rows are matched on the index.
train_parts = [train_x_numerical, train_categoric]
test_parts = [test_x_numerical, test_categoric]

train_x = pd.concat(train_parts, axis="columns")
test_x = pd.concat(test_parts, axis="columns")

print(train_x.shape,test_x.shape)
train_x.head()

**Fill NA values. We follow this approach: (1) if a column holds numeric data, fill its NA values with the column mean; (2) if a column holds categorical data, fill its NA values with the most frequent value.**

In [None]:
"""Now we can fill datas with following above way."""
"""For train x."""
# Fill the missing values of train_x one column at a time:
#   * numeric column     -> the mean of that column
#   * non-numeric column -> the most frequent value of that column
# Each fill value is applied to its own column only. DataFrame.fillna with a
# single scalar fills the gaps of every column at once, so the value computed
# for the first column would end up in all the other columns as well.
for key in train_x.keys():
    column = train_x[key]
    if not column.isna().any():
        continue
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.mean()
    else:
        counts = column.value_counts()  # most frequent first, missing values excluded
        if counts.empty:
            continue  # the whole column is missing: there is no value to fill with
        fill_value = counts.index[0]
    train_x[key] = column.fillna(fill_value)

# check_na returns an empty list once no column has missing values left.
train_nan_keys = check_na(train_x)
print(train_nan_keys)
        
        
    


In [None]:

"""Fill na for test x."""
# Fill the missing values of test_x one column at a time:
#   * numeric column     -> the mean of that column
#   * non-numeric column -> the most frequent value of that column
# Each fill value is applied to its own column only. DataFrame.fillna with a
# single scalar fills the gaps of every column at once, so the value computed
# for the first column would end up in all the other columns as well.
# NOTE(review): the fill values are computed from test_x itself; consider
# reusing the values computed on the training data instead.
for key in test_x:
    column = test_x[key]
    if not column.isna().any():
        continue
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.mean()
    else:
        counts = column.value_counts()  # most frequent first, missing values excluded
        if counts.empty:
            continue  # the whole column is missing: there is no value to fill with
        fill_value = counts.index[0]
    test_x[key] = column.fillna(fill_value)

# check_na returns an empty list once no column has missing values left.
test_nan_keys = check_na(test_x)
print(test_nan_keys)
        


**Now we can convert all categorical values to numerical values.**


In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Replace every selected categorical column of train_x with integer codes.
# The encoder is refitted for each column; values are cast to str first so all
# labels of a column have the same type.
for key in categorical_features:
    transformed_data = label_encoder.fit_transform(train_x[key].values.astype("str").ravel())
    # Overwrite the column in place. This keeps the column order, does not copy
    # the whole frame once per column, and does not depend on train_x having a
    # default 0..n-1 index (which concatenating a fresh frame would require).
    train_x[key] = transformed_data

# Prints an empty list when no categorical column is left.
print(check_categorical(train_x))
train_x.head()

In [None]:
"""For test_x."""
# Encode the categorical columns of test_x with the same label -> code mapping
# that train_x received. Refitting a LabelEncoder on a test column numbers the
# labels by their sorted order within the test data, so a label gets a
# different code than in training whenever the two sets of labels differ.
# The encoders fitted on the train columns are not kept, so the mapping is
# rebuilt by pairing the raw train labels (train_categoric) with the codes now
# stored in train_x; both frames hold the same rows in the same order.
UNSEEN_LABEL_CODE = -1  # code given to test labels that never occur in the train column
for key in categorical_features:
    raw_train_labels = train_categoric[key]
    observed = raw_train_labels.notna().to_numpy()
    code_by_label = dict(zip(
        raw_train_labels.to_numpy()[observed].astype("str"),
        train_x[key].to_numpy()[observed],
    ))

    test_labels = test_x[key].values.astype("str").ravel()
    transformed_data = np.array(
        [code_by_label.get(label, UNSEEN_LABEL_CODE) for label in test_labels],
        dtype=int,
    )
    n_unseen = int((transformed_data == UNSEEN_LABEL_CODE).sum())
    if n_unseen:
        print(f"{key}: {n_unseen} test value(s) not present in train, encoded as {UNSEEN_LABEL_CODE}")

    # Overwrite the column in place; the column order stays the same as in train_x.
    test_x[key] = transformed_data

# Prints an empty list when no categorical column is left.
print(check_categorical(test_x))
test_x.head()

In [None]:
"""Seperate "SalePrice" column from train data and drop it.Then assign it as train_y. """
# Target as a Series (SalePrice) and as a one-column frame (train_y);
# train_x keeps every other column.
SalePrice = train_x["SalePrice"]
train_y = SalePrice.to_frame(name="SalePrice")
train_x = train_x.drop(columns="SalePrice")
train_y

**STANDARDIZATION OF DATA**

In [None]:
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()

# Learn the per-column mean and standard deviation from the training features
# only, then apply that same transformation to the test features. Refitting the
# scaler on the test data would put the two sets on different scales, and the
# models are trained on the training scale.
scaled_train_x = SS.fit_transform(train_x)
scaled_test_x = SS.transform(test_x)

# Convert the scaled arrays back to DataFrames with the original column names.
scaled_train_x = pd.DataFrame(scaled_train_x, columns=train_x.columns)
scaled_test_x = pd.DataFrame(scaled_test_x, columns=test_x.columns)

**APPLY ISOMAP FOR NON-LINEAR DIMENSIONALITY REDUCTION (3D)**

In [None]:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=5, n_components=3)

# Fit the 3-component embedding on the training data and project the test data
# into that same embedding. Fitting a second embedding on the test data would
# produce coordinates that are not comparable with the training ones.
iso_scaled_train_x = iso.fit_transform(scaled_train_x)
iso_scaled_test_x = iso.transform(scaled_test_x)

iso_columns = ["ISO1", "ISO2", "ISO3"]
iso_scaled_train_x = pd.DataFrame(iso_scaled_train_x, columns=iso_columns)
iso_scaled_test_x = pd.DataFrame(iso_scaled_test_x, columns=iso_columns)

print(iso_scaled_train_x)
print(iso_scaled_test_x)

**PCA OF DATA (5D)**

**FIT MODEL**

In [None]:
"""Multiple Linear Regression."""
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# Linear regression is fitted on the one-column target frame; the two boosted
# models below are fitted on the flattened 1-D target.
LR = LinearRegression().fit(scaled_train_x, train_y)

"""Gradient Boosting."""
GBR = GradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
).fit(scaled_train_x, train_y.values.ravel())

"Xgboost algorithm."
XGB = xgb.XGBRegressor(
    booster="gbtree",
    eta=0.05,
    max_depth=7,
    n_estimators=200,
    gamma=0.2,
    reg_lambda=1,
)
# Kept as the last expression of the cell so the fitted estimator is displayed.
XGB.fit(scaled_train_x, train_y.values.ravel())

In [None]:
# Predict the target for the scaled test features with the fitted XGBoost regressor.
preds = XGB.predict(scaled_test_x)

**WRITE PREDICTIONS TO CSV**

In [None]:
# Pair every test Id with its predicted SalePrice, then write the table to CSV
# without the index column.
result = test_Id.to_frame(name="Id").assign(SalePrice=preds.ravel())
result.to_csv("predictions_house_regression.csv", index=False)

In [None]:
# Show the predictions table as the output of this cell.
result