In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the training and test tables from the competition's input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [3]:
# Separate the target from the predictors. 'Id' is dropped from both feature
# tables; train_y stays a one-column DataFrame holding 'SalePrice'.
train_X = train_data.drop(columns=['Id', 'SalePrice'])
train_y = train_data.loc[:, ['SalePrice']]
test_data = test_data.drop(columns=['Id'])

In [4]:
# Names of the categorical (object dtype) and numeric (int/float dtype)
# feature columns, as plain Python lists.
obj_col = train_X.select_dtypes(include='object').columns.tolist()
num_col = train_X.select_dtypes(include=['int', 'float']).columns.tolist()

In [5]:
# Number of training rows; used later as the row offset where test rows begin.
split = len(train_X)

In [6]:
# Stack the training features on top of the test features (with a fresh
# 0..n-1 index) so categorical columns of both sets are one-hot encoded together.
data = pd.concat(objs=(train_X, test_data), axis=0, ignore_index=True)

In [7]:
# Replace missing categorical values with the string "0" so that "missing"
# becomes a category of its own.
data[obj_col] = data.loc[:, obj_col].fillna(value="0")

In [8]:
# One-hot encode the categorical columns and append the indicator columns
# (named 0..k-1) to `data`.
# The encoder is left at its default sparse output and densified with
# .toarray(): the `sparse=False` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed `sparse_output`), so this form runs on old and new
# releases alike.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded = ohe.fit_transform(data[obj_col]).toarray()
# Reuse data's index so the join below aligns row for row.
new_df = pd.DataFrame(encoded, index=data.index)
data = data.join(new_df)

In [9]:
# Remove the original categorical columns from the combined table.
data = data.drop(labels=obj_col, axis='columns')

In [10]:
# Split the combined table back into train and test rows.
# .copy() makes each part an independent frame; without it, the column
# assignments performed on these frames later write into slices of `data`
# and raise SettingWithCopyWarning.
train_X = data.iloc[:split, :].copy()
test_data = data.iloc[split:, :].copy()

In [11]:
# Fill NaN in the numeric columns with the column means of the TRAINING set.
# The same training means are applied to the test set, so both sets are
# preprocessed identically and no statistic is estimated from test data.
# fillna() with a Series fills column by column and returns a new frame, so
# nothing is assigned into a slice here.
train_means = train_X[num_col].mean(skipna=True)
train_X = train_X.fillna(train_means)
test_data = test_data.fillna(train_means)

In [12]:
# Disabled experiment, kept for reference. It is written as real comments
# because a bare triple-quoted string is an expression that the notebook
# evaluates and echoes as the cell's output.
#
# Choose features based on correlation with SalePrice.
# Features whose correlation lies in [-0.5, 0.5] are treated as weak and dropped.
# NOTE: corr_min and corr_max are computed but never used below.
#
# corr = train_X.corrwith(train_data['SalePrice'])
# corr_min = corr.min() + corr.std()
# corr_max = corr.max() - corr.std()
# corr0 = corr[corr > 0.5]
# corr1 = corr[corr < -0.5]
# corr = pd.concat([corr0, corr1])
# column = corr.index

In [13]:
# Per-column flag: True for every training-feature column that still holds a NaN.
train_X.isna().any(axis=0)

In [14]:
# Converting dataframe to numpy array
X_train = train_X.to_numpy()
# train_y is a one-column DataFrame, so to_numpy() yields shape (n, 1).
# Flatten it to (n,), the target shape scikit-learn regressors expect; a
# column vector makes every fit() emit a DataConversionWarning.
y_train = train_y.to_numpy().ravel()
X_test = test_data.to_numpy()

In [15]:
# Feature selection: fit a gradient-boosting model and keep the columns that
# SelectFromModel (with its default threshold) marks as important.
# random_state pins the estimator so the selected columns are the same on
# every run; np.ravel hands the target over as a 1-D array.
sfm = SelectFromModel(GradientBoostingRegressor(random_state=0))
sfm.fit(X_train, np.ravel(y_train))
support_mask = sfm.get_support()  # boolean mask over the columns of train_X
selected_col = train_X.columns[support_mask]
selected_col

In [16]:
# Restrict both feature tables to the columns chosen by the selector.
train_X, test_data = (frame[selected_col] for frame in (train_X, test_data))

In [17]:
# Disabled experiment, kept for reference. It is written as real comments
# because a bare triple-quoted string is an expression that the notebook
# evaluates and echoes as the cell's output.
#
# Adding polynomial features (pairwise products of the selected columns).
# NOTE: j runs over the full range, so every cross product is generated twice
# (a_b and b_a); start j at i if this is ever re-enabled.
#
# col = train_X.columns
# num = len(col)
# for i in range(num):
#     for j in range(num):
#         new_col = col[i] + "_" + col[j]
#         train_X[new_col] = train_X[col[i]] * train_X[col[j]]
#         test_data[new_col] = test_data[col[i]] * test_data[col[j]]

In [18]:
# Disabled experiment, kept for reference. It is written as real comments
# because a bare triple-quoted string is an expression that the notebook
# evaluates and echoes as the cell's output.
#
# After adding polynomial data, keep features with correlation larger than 0.7.
# NOTE: corr_min and corr_max are computed but never used below.
#
# corr = train_X.corrwith(train_data['SalePrice'])
# corr_min = corr.min() + corr.std()
# corr_max = corr.max() - corr.std()
# corr = corr[corr > 0.7]
# column = corr.index
# train_X = train_X[column]
# test_data = test_data[column]

In [19]:
# Rebuild the numpy feature matrices from the column-reduced feature tables.
X_train, X_test = train_X.to_numpy(), test_data.to_numpy()

In [20]:
# Use randomized search (RandomizedSearchCV) to get the optimal hyperparameters.
# Disabled, kept for reference. It is written as real comments because a bare
# triple-quoted string is an expression that the notebook evaluates and echoes
# as the cell's output.
#
# # Number of boosting stages in the gradient-boosting model
# n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# learning_rate = [0.01, 0.1]
# min_samples_split = [2, 4, 5, 10]
# min_samples_leaf = [1, 2, 4]
# max_depth = [3,4,5,10]
# loss = ['squared_error', 'absolute_error', 'huber', 'quantile']
#
# opt_dict = {'n_estimators': n_estimators,
#             'learning_rate': learning_rate,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'loss': loss}
#
# model = GradientBoostingRegressor()
#
# model = RandomizedSearchCV(model, opt_dict, n_iter=100)
# #y_train = np.reshape(y_train, (y_train.shape[0],))
# model.fit(X_train,y_train)
# print(model.best_score_)
# print(model.best_params_)

In [21]:
# Build the model with the chosen hyperparameters and cross-validate it.
# random_state makes the fits, and therefore the fold scores, repeatable.
# np.ravel passes the target as a 1-D array, which avoids a
# DataConversionWarning on every fold when y_train is a column vector.
# return_estimator=True keeps the estimator fitted on each fold.
model = GradientBoostingRegressor(
    learning_rate=0.1,
    loss='squared_error',
    max_depth=3,
    min_samples_split=5,
    n_estimators=100,
    random_state=0,
)
result = cross_validate(model, X_train, np.ravel(y_train), return_estimator=True)

In [22]:
# cross_validate is an evaluation tool: every fold estimator saw only part of
# the training rows, and all folds share the same hyperparameters, so taking
# the fold with the highest validation score selects on the split rather than
# on model quality. Report the scores, then fit the final model on ALL
# training rows using the same hyperparameters.
fold_scores = result['test_score']
max_index = fold_scores.argmax()  # best-scoring fold, reported for information only
print(f"CV score: mean={fold_scores.mean():.4f}, "
      f"best fold={max_index} ({fold_scores[max_index]:.4f})")
final_params = result['estimator'][0].get_params()
model = GradientBoostingRegressor(**final_params).fit(X_train, np.ravel(y_train))

In [23]:
# Predict the target for every row of the test feature matrix.
output = model.predict(X=X_test)

In [24]:
# Build the Id column as consecutive values starting at the hard-coded 1461
# (presumably the first Id of test.csv, whose Id column was dropped earlier:
# verify), then shape both the Ids and the predictions as (n, 1) float columns.
n_rows = np.shape(output)[0]
index_array = (1461 + np.arange(n_rows, dtype=float)).reshape(n_rows, 1)
output = np.reshape(output, (n_rows, 1))

In [25]:
# Wrap the Id and prediction columns in DataFrames (Id cast to int, SalePrice
# to float) and place them side by side.
index = pd.DataFrame(data=index_array, columns=['Id'], dtype=int)
sale = pd.DataFrame(data=output, columns=['SalePrice'], dtype=float)
output_df = pd.concat([index, sale], axis=1)

In [26]:
# Save the predictions to output.csv, leaving out the DataFrame index column.
output_df.to_csv(path_or_buf='output.csv', index=False)