In [None]:
!pip3 install numpy --upgrade
!pip3 install pandas --upgrade

In [None]:
# import python libraries                                   
import os     

# data analysis
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# sklearn utilities
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# prediction
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

In [None]:
# Hyperparameters handed to the XGBoost container (all values given as strings).
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
}

# S3 location under which the trained model artifact will be written.
# NOTE(review): the 'abalone-xgb-built-in-algo' folder name looks like a leftover
# from a tutorial; it is kept unchanged so existing artifacts stay where they are.
bucket = 'lawsnic-east1'
prefix = 'kaggle/house-prices-advanced-regression-techniques'
output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'abalone-xgb-built-in-algo')

# Look up the XGBoost image URI for the region this session actually runs in.
# A hard-coded region can point at an image from a different region than the
# training job, so the region is taken from the SageMaker session instead.
region = sagemaker.Session().boto_region_name
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.5-1")

# Estimator that runs the retrieved XGBoost container.
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    volume_size=5,  # 5 GB
    output_path=output_path,
)

# Load the train and test CSV files from the working directory.
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

# Let pandas display every column of the training frame.
pd.options.display.max_columns = train_data.shape[1]

In [None]:
# Display the (rows, columns) dimensions of the training frame.
train_data.shape

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Print the per-column dtype and non-null count summary of the training frame.
train_data.info()

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns too.
train_data.describe(include = 'all')

In [None]:
# Columns removed from both frames. Keeping them in a single list guarantees the
# train and test frames always lose exactly the same columns.
dropped_columns = ['Id', 'PoolQC', 'MiscFeature', '2ndFlrSF', 'FireplaceQu', 'YearBuilt', 'GarageCars']

# errors="ignore" keeps the cell re-runnable: once the columns are gone, a second
# execution is a no-op instead of raising KeyError.
train_data = train_data.drop(columns=dropped_columns, errors="ignore")
test_data = test_data.drop(columns=dropped_columns, errors="ignore")

In [None]:
def _subclass_to_label(codes):
    """Turn the numeric MSSubClass codes into string labels, one label per code.

    The previous letter encoding computed int(code / 10 + 65), which truncates:
    any two codes that differ by less than 10 within the same decade (for
    example 40 and 45) collapsed onto the same letter. Using the code's own
    digits as the label keeps every distinct code distinct, and the conversion
    can be applied again to its own output without failing.
    """
    return codes.astype(int).apply(str)


# Treat MSSubClass as a categorical (string) feature in both frames.
train_data['MSSubClass'] = _subclass_to_label(train_data['MSSubClass'])
test_data['MSSubClass'] = _subclass_to_label(test_data['MSSubClass'])

In [None]:
# Numeric view of each frame: every column whose dtype is not `object`.
train_data_num, test_data_num = (
    frame.select_dtypes(exclude=['object']) for frame in (train_data, test_data)
)
train_data_num.head()

In [None]:
# Categorical view of each frame: only the columns whose dtype is `object`.
train_data_cat, test_data_cat = (
    frame.select_dtypes(include=['object']) for frame in (train_data, test_data)
)
train_data_cat.head()

In [None]:
# One 30-bin histogram per column of the numeric frame; the trailing semicolon
# suppresses the textual repr of the returned axes.
train_data_num.hist(figsize=(25, 30), bins=30);

In [None]:
# Bar chart of value counts for every categorical column, stacked vertically.
# The number of panels is derived from the frame rather than hard-coded, so the
# loop can neither run out of axes nor leave empty panels behind.
cat_columns = list(train_data_cat.columns)

# squeeze=False always yields a 2-D array of axes, even for a single column.
fig, axes = plt.subplots(len(cat_columns), figsize=(30, 100), squeeze=False)

for panel, categorical_feature in zip(axes.ravel(), cat_columns):
    train_data_cat[categorical_feature].value_counts().plot(kind="bar", ax=panel)
    panel.set_title(categorical_feature)

# plt.show() renders under the inline backend; fig.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Remove numeric features whose variance is below the threshold.
# The target is excluded by name so that the frame the selector is fitted on and
# the frame the support mask is applied to are guaranteed to be the same columns
# (the earlier positional slice only worked while SalePrice was the last column).
features_num = train_data_num.drop(columns=['SalePrice'])

selector = VarianceThreshold(threshold=0.05)
selector.fit(features_num)

# Boolean mask: True for features that pass the variance threshold.
sup = selector.get_support()

print('Number of retained features: ', sup.sum())

print('Number low-variance features: ', (~sup).sum())

low_var_fet = features_num.columns[~sup].values

print('Low-variance features: ', low_var_fet)

print('Before: ',train_data_num.shape, test_data_num.shape)
# Re-bind instead of dropping in place, so the frames derived from
# select_dtypes are never mutated behind an earlier cell's back.
train_data_num = train_data_num.drop(columns=low_var_fet)
test_data_num = test_data_num.drop(columns=low_var_fet)
print('After: ', train_data_num.shape, test_data_num.shape)

In [None]:
# Show floats with thousands separators and two decimals from here on.
pd.options.display.float_format = "{:,.2f}".format

# Pairwise Pearson correlation between the remaining numeric columns.
corr_mat = train_data_num.corr(method='pearson')

# Blank out very weak correlations (strictly between -0.3 and 0.3).
weak = corr_mat.abs() < 0.3
corr_mat[weak] = 0

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr_mat, dtype=bool))

plt.figure(figsize=(20, 20))
sns.heatmap(
    corr_mat,
    mask=mask,
    vmin=-1.0,
    vmax=1.0,
    square=True,
    annot=True,
    annot_kws={"size": 9, "color": "black"},
    linewidths=0.1,
    cmap='rocket',
);

In [None]:
# Correlation of each remaining numeric feature with the target (the target's
# own entry removed), displayed from strongest positive downwards.
corr_features = corr_mat.loc[:, 'SalePrice'].drop(labels='SalePrice')
corr_features.sort_values(ascending=False)

In [None]:
# Keep the features whose absolute correlation with SalePrice is at least 0.3.
strong = corr_features.abs() >= 0.3
chosen_feats = corr_features.index[strong.to_numpy()].tolist()

# Training frame keeps the target as its first column; the test frame has none.
cleaned_train_data = train_data.loc[:, ['SalePrice', *chosen_feats]]
cleaned_test_data = test_data.loc[:, chosen_feats]



In [None]:
# Shuffle the rows reproducibly, then cut them into the first 70% (train),
# the next 20% (validation) and the last 10% (test).
# Plain positional slicing replaces np.split, which is deprecated for DataFrames.
shuffled = cleaned_train_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled))
val_end = int(0.9 * len(shuffled))

splitData_train = shuffled.iloc[:train_end]
splitData_val = shuffled.iloc[train_end:val_end]
splitData_test = shuffled.iloc[val_end:]

# to_csv does not create missing directories, so make sure they exist first.
os.makedirs('train', exist_ok=True)
os.makedirs('validation', exist_ok=True)

# Written without header or index, target (SalePrice) in the first column.
splitData_train.to_csv('train/train.csv', index=False, header=False)
splitData_val.to_csv('validation/validation.csv', index=False, header=False)

# S3 keys always use '/', so they are built explicitly rather than through
# os.path.join, whose separator depends on the local operating system.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/{}'.format(prefix, 'input/train/train.csv')).upload_file('train/train.csv')
s3_bucket.Object('{}/{}'.format(prefix, 'input/validation/validation.csv')).upload_file('validation/validation.csv')

In [None]:
# SageMaker session and the IAM role the training job will run under.
sess = sagemaker.Session()
role = get_execution_role()

# Estimator backed by the XGBoost image for the session's own region.
xgb = sagemaker.estimator.Estimator(
    sagemaker.image_uris.retrieve("xgboost", sess.boto_region_name, "1.5-1"),
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
)

# The label is SalePrice, a continuous value, so this is a regression problem.
# "binary:logistic" requires labels in [0, 1] and cannot train on this target;
# "reg:squarederror" matches the objective used in the hyperparameters dict
# defined at the top of the notebook.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    objective="reg:squarederror",
    num_round=100,
)

In [None]:
# CSV channels pointing at the uploaded train and validation prefixes.
s3_input_train = TrainingInput(
    s3_data="s3://{}/{}/input/train/".format(bucket, prefix),
    content_type="csv",
)
s3_input_validation = TrainingInput(
    s3_data="s3://{}/{}/input/validation/".format(bucket, prefix),
    content_type="csv",
)

# Start the training job with both channels attached.
xgb.fit(
    {
        "train": s3_input_train,
        "validation": s3_input_validation,
    }
)

In [None]:
# Displays the path formed by joining `prefix` with 'train/train.csv'.
# NOTE(review): the upload cell writes to '<prefix>/input/train/train.csv', so the
# path shown here does not match an uploaded object; this looks like a leftover
# scratch check, confirm before relying on it.
os.path.join(prefix, 'train/train.csv')