In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)
import matplotlib.pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the training and test tables of the House Prices competition.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [3]:
# Separate the target (kept as a one-column DataFrame) from the feature columns.
target_columns = ['SalePrice']
train_X = train_data.drop(columns=target_columns)
train_y = train_data[target_columns]

In [4]:
# Choose features based on their correlation with SalePrice.
# Features whose correlation lies in the weak band [-0.5, 0.5] are dropped.
#
# The correlation is computed on numeric columns only: `corrwith` cannot
# correlate object (string) columns, and newer pandas versions raise instead
# of silently skipping them.
numeric_X = train_X.select_dtypes(include='number')
corr = numeric_X.corrwith(train_data['SalePrice'])

# Strong positive correlations first, then strong negative ones (same ordering
# as before). NaN correlations (e.g. constant columns) fail both comparisons
# and are therefore excluded.
corr0 = corr[corr > 0.5]
corr1 = corr[corr < -0.5]
corr = pd.concat([corr0, corr1])
column = corr.index

In [5]:
# Keep only the selected feature columns in both tables.
# `.copy()` makes each result an independent frame: later cells assign new
# columns into these frames, which on a plain column selection triggers
# pandas' SettingWithCopyWarning.
train_X = train_X[column].copy()
test_data = test_data[column].copy()

In [6]:
# Get object data columns and numeric data columns.
# 'number' matches every numeric dtype (all int/uint/float widths), whereas
# ['int', 'float'] only matches a subset of them.
obj_col = list(train_X.select_dtypes(include=['object']).columns)
num_col = list(train_X.select_dtypes(include='number').columns)

In [7]:
# Transform object data to numeric for both train and test data (nan -> 0).
# Each distinct non-missing training value gets an integer code starting at 1,
# in order of first appearance; 0 is reserved for missing values. The same
# mapping is applied to the test data, so a test value that never occurs in
# the training data is also encoded as 0.
for col in obj_col:
    categories = train_X[col].dropna().unique()
    codes = {category: code for code, category in enumerate(categories, start=1)}
    train_X[col] = train_X[col].map(codes).fillna(0).astype(int)
    test_data[col] = test_data[col].map(codes).fillna(0).astype(int)


In [8]:
# Fill nan in numeric columns with the mean value of that column.
# The mean is taken from the training data and reused for the test data so
# both tables are imputed with the same value (and a test column that is
# entirely missing still gets filled).
# The result is assigned back instead of using `fillna(..., inplace=True)` on
# a column selection: that is chained assignment, which pandas warns about and
# which no longer updates the parent frame under Copy-on-Write.
for col in num_col:
    col_mean = train_X[col].mean()
    train_X[col] = train_X[col].fillna(col_mean)
    test_data[col] = test_data[col].fillna(col_mean)

In [9]:
# Remove outlier for all the numeric columns
# Use Inter-Quartile Range (IQR) proximity rule.
# The data points which fall below Q1 - 1.5 IQR or above Q3 + 1.5 IQR are outliers.
# where Q1 and Q3 are the 25th and 75th percentile of the dataset respectively,
# and IQR represents the inter-quartile range and given by Q3 - Q1.
#
# This step is intentionally NOT applied to train_X: it was disabled in the
# notebook (kept as a string literal). It is kept as a helper so it can be
# switched on explicitly, e.g. `train_X = replace_iqr_outliers_with_mean(train_X)`.
def replace_iqr_outliers_with_mean(frame, whisker=1.5):
    """Return a copy of `frame` with IQR outliers replaced by the column mean.

    For every column, values below Q1 - whisker * IQR or above
    Q3 + whisker * IQR are treated as outliers. Outliers (and values that
    were already missing) are replaced by the mean of the remaining values
    of that column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are all numeric.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        A new frame; `frame` itself is left unchanged.
    """
    cleaned = frame.copy()
    for name in cleaned.columns:
        values = cleaned[name]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - whisker * iqr
        upper_bound = q3 + whisker * iqr
        # `between` is inclusive, so only values strictly outside the bounds
        # (and NaN) are masked out.
        kept = values.where(values.between(lower_bound, upper_bound))
        cleaned[name] = kept.fillna(kept.mean())
    return cleaned

In [10]:
# Adding polynomial features: the pairwise product (including squares) of the
# current feature columns, named "<left>_<right>".
# Each unordered pair is generated once; the mirrored product "<right>_<left>"
# would be an exact duplicate of "<left>_<right>".
# All new columns are collected first and attached with a single concat, which
# avoids the frame fragmentation caused by inserting columns one at a time.
base_columns = list(train_X.columns)
train_products = {}
test_products = {}
for position, left in enumerate(base_columns):
    for right in base_columns[position:]:
        new_col = left + "_" + right
        train_products[new_col] = train_X[left] * train_X[right]
        test_products[new_col] = test_data[left] * test_data[right]

train_X = pd.concat(
    [train_X, pd.DataFrame(train_products, index=train_X.index)], axis=1
)
test_data = pd.concat(
    [test_data, pd.DataFrame(test_products, index=test_data.index)], axis=1
)

In [11]:
# After adding polynomial data, keep features with correlation larger than 0.7
corr = train_X.corrwith(train_data['SalePrice'])
corr = corr[corr > 0.7]
column = corr.index
# Fail here with a clear message rather than later, when the model would be
# given a feature matrix with zero columns.
if len(column) == 0:
    raise ValueError("No feature has a correlation with SalePrice larger than 0.7")
# `.copy()` keeps the reduced frames independent of the wider frames they
# were selected from.
train_X = train_X[column].copy()
test_data = test_data[column].copy()

In [12]:
# Turn the feature and target frames into plain NumPy arrays for the model.
X_train, y_train, X_test = (
    frame.to_numpy() for frame in (train_X, train_y, test_data)
)

In [13]:
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor

# Cross-validate a decision tree and keep the estimator fitted on each fold.
# A fixed random_state makes the fitted trees (and therefore the scores and
# the final predictions) reproducible across runs.
model = DecisionTreeRegressor(random_state=0)
result = cross_validate(model, X_train, y_train, return_estimator=True)

In [14]:
# Use the fold estimator that achieved the highest cross-validation test score.
# NOTE(review): this estimator was fitted on the training folds of one split
# only, and picking it by its own validation score is optimistic; consider
# refitting on the full training data instead.
fold_scores = result['test_score']
max_index = np.argmax(fold_scores)
model = result['estimator'][max_index]

In [15]:
# Predict SalePrice for every row of the test feature matrix.
output = model.predict(X=X_test)

In [16]:
# Build the (n_rows, 2) submission array: Id in column 0, prediction in column 1.
# Ids are generated as 1461, 1462, ... (one per prediction) instead of being
# read from the test table.
# NOTE(review): confirm that this matches the Id column of test.csv.
n_predictions = np.shape(output)[0]
index_array = np.arange(1461, 1461 + n_predictions, dtype=float).reshape(n_predictions, 1)
output = np.hstack((index_array, np.reshape(output, (n_predictions, 1))))

In [17]:
# Write the submission file.
# Only the Id column is converted to int. Passing `dtype=int` for the whole
# frame is avoided because the predictions are floats: newer pandas versions
# raise a ValueError when fractional float data is given an integer dtype, and
# a forced cast would truncate the predicted prices.
output_df = pd.DataFrame({
    'Id': output[:, 0].astype(int),
    'SalePrice': output[:, 1],
})
output_df.to_csv('output.csv', index=False)