In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import warnings
warnings.filterwarnings('ignore')
# working in kaggle

kaggle_data_path = '../input'
local_data_path = 'data'

data_path = kaggle_data_path if os.path.exists(kaggle_data_path) else local_data_path

%ls $data_path

In [None]:
# Load the train and test tables from the selected data directory
train_csv = f'{data_path}/train.csv'
test_csv = f'{data_path}/test.csv'

data_train = pd.read_csv(train_csv)
data_test = pd.read_csv(test_csv)

# Report (rows, columns) of each table
print('train data shape: ', data_train.shape)
print('test data shape: ', data_test.shape)

In [None]:
# Target variable
#
# pop() returns the SalePrice column and removes it from data_train in place,
# so data_train holds only the remaining columns afterwards.
y_data = data_train.pop('SalePrice')

In [None]:
# Summary statistics of the training table, transposed to one row per column
data_train.describe().transpose()

In [None]:
# Ten columns with the most missing values in train, as a fraction of all rows
train_missing = data_train.isna().sum().sort_values(ascending=False)
train_missing.head(10) / len(data_train)

In [None]:
# Preview the first rows of the test table
data_test.head(n=5)

In [None]:
# Ten columns with the most missing values in test, as a fraction of all rows
test_missing = data_test.isna().sum().sort_values(ascending=False)
test_missing.head(10) / len(data_test)

## Missing values

In both the training and test data, 6 features are missing in more than 50% of the rows: `PoolQC`, `MiscFeature`, `Alley`, `Fence`, `MasVnrType`, and `FireplaceQu`.

In [None]:
# Columns dropped because most of their values are missing
sparse_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']

# Stack train and test so both are preprocessed together, drop the sparse
# columns, and use Id as the row index.
data = (
    pd.concat([data_train, data_test], axis=0)
    .drop(columns=sparse_columns)
    .set_index('Id')
)

In [None]:
# Total number of missing values, grouped by column dtype
missing_per_column = data.isna().sum()
missing_per_column.groupby(data.dtypes).sum()

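We have three types of features: `int64`, `float64`, and `object`. The `int64` columns cannot hold NaN, so they need no imputation. For the `float64` features we fill missing values with the mean, except for the count-like columns (`BsmtFullBath`, `BsmtHalfBath`, `GarageCars`), which are filled with the median. For the `object` features we fill missing values with the mode.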

In [None]:
# Work on the float64 columns separately
data_float = data.select_dtypes(include='float64')
data_float.head(n=5)

In [None]:
# Columns whose gaps are filled with the column mean
mean_columns = ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageYrBlt', 'GarageArea']
# Columns whose gaps are filled with the column median
median_columns = ['BsmtFullBath', 'BsmtHalfBath', 'GarageCars']

# Both statistics are computed before data_float is modified
column_means = data_float.mean()
column_medians = data_float.median()

fill_with_mean = data_float[mean_columns].fillna(column_means)
fill_with_median = data_float[median_columns].fillna(column_medians)

# Write the imputed columns back into data_float in place
data_float.update(fill_with_mean)
data_float.update(fill_with_median)

In [None]:
# Copy the imputed float columns back into the combined table
data.update(data_float)

# Remaining missing fraction per float column (ten largest)
float_missing = data_float.isna().sum().sort_values(ascending=False)
float_missing.head(10) / len(data_float)

In [None]:
# Work on the object-dtype columns separately
data_object = data.select_dtypes(include='object')
data_object.head(n=5)

In [None]:
# First row of mode() holds one most-frequent value per column
most_frequent = data_object.mode().iloc[0]
data_object = data_object.fillna(most_frequent)

# Cast every column to the pandas category dtype
data_cat = data_object.astype('category')

# Preview the result
data_cat.head(n=5)

# Ordinal features

We need to convert ordinal features to numerical values.

In [None]:
# One encoder instance, refitted on each column in turn
le = LabelEncoder()

# Replace the category labels of every column with integer codes
data_encode = data_cat.apply(lambda column: le.fit_transform(column))
data_encode.head(n=5)

In [None]:
# Write the encoded columns back by assignment rather than DataFrame.update().
# update() modifies values in place and leaves the columns with their original
# object dtype, so `.values` on the table would still produce an object array
# and break consumers that need a numeric array (such as the Keras model).
# Assignment replaces each column with its integer-coded version; rows are
# aligned on the shared Id index.
data[data_encode.columns] = data_encode

# Missing values left per column, largest first. All counts are expected to be
# zero provided the imputation steps covered every column with gaps.
data.isna().sum().sort_values(ascending=False)

In [None]:
# Split the preprocessed table back into train and test rows using the Id
# values of the originally loaded tables.
train_ids = data_train['Id']
test_ids = data_test['Id']

data_train = data.loc[train_ids]
data_test = data.loc[test_ids]

# Features used for the final predictions
X_test = data_test

In [None]:
# Hold out 20% of the training rows for validation. A fixed random_state makes
# the split, and therefore every score below, reproducible across
# Restart & Run All.
X_train, X_cv, y_train, y_cv = train_test_split(data_train, y_data, test_size=0.2, random_state=42)

# One row per model; columns hold the MSE on the train and validation splits
scores = pd.DataFrame(columns=['train', 'cv'])

In [None]:
# Baseline: ordinary least squares on all features
lr = LinearRegression()
lr.fit(X_train, y_train)

# MSE on the train and validation splits
lr_train_mse = mean_squared_error(y_train, lr.predict(X_train))
lr_cv_mse = mean_squared_error(y_cv, lr.predict(X_cv))

# The row label is shown in the score table and on the plot axis; it was
# previously misspelled as 'lenealRegression'.
scores.loc['linearRegression'] = lr_train_mse, lr_cv_mse
scores.head()

In [None]:
# Regularisation strengths to search over
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

grid_params = dict(alpha=alpha)

# 5-fold cross-validated search for the best Lasso alpha
grid_search = GridSearchCV(estimator=Lasso(), param_grid=grid_params, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print('best_params: ', best_params)

# Fit a fresh Lasso with the selected alpha (fit() returns the estimator)
lasso = Lasso(**best_params).fit(X_train, y_train)

# MSE on the train and validation splits
lasso_train_mse = mean_squared_error(y_train, lasso.predict(X_train))
lasso_cv_mse = mean_squared_error(y_cv, lasso.predict(X_cv))

scores.loc['lasso'] = lasso_train_mse, lasso_cv_mse
scores.head()

In [None]:
# Rebuild the grid here instead of relying on whatever `grid_params` currently
# holds: later cells rebind that name to the ElasticNet and XGBoost grids,
# whose keys Ridge does not accept, so re-running this cell out of order would
# otherwise fail.
grid_params = {'alpha': alpha}

# 5-fold cross-validated search for the best Ridge alpha
grid_search = GridSearchCV(Ridge(), grid_params, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print('best_params: ', best_params)

# Fit a fresh Ridge with the selected alpha
ridge = Ridge(**best_params)
ridge.fit(X_train, y_train)

# MSE on the train and validation splits
scores.loc['ridge'] = mean_squared_error(y_train, ridge.predict(X_train)), mean_squared_error(y_cv, ridge.predict(X_cv))
scores.head()

In [None]:
# l1_ratio is the L1/L2 mixing weight and must lie in [0, 1]. The previous grid
# reused the `alpha` list, which includes 10, 100, 1000 and 10000, so half of
# the candidates were invalid. Search a proper range instead (1.0 is pure L1).
l1_ratio = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]

grid_params = {'alpha': alpha, 'l1_ratio': l1_ratio}

# 5-fold cross-validated search over both hyperparameters
grid_search = GridSearchCV(ElasticNet(), grid_params, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print('best_params: ', best_params)

# Fit a fresh ElasticNet with the selected hyperparameters
elastic = ElasticNet(**best_params)
elastic.fit(X_train, y_train)

# MSE on the train and validation splits
scores.loc['elastic'] = mean_squared_error(y_train, elastic.predict(X_train)), mean_squared_error(y_cv, elastic.predict(X_cv))
scores.head()

In [None]:
# Predict on the test rows with the ElasticNet model and save a submission file
y_pred = elastic.predict(X_test)
pred_df = pd.DataFrame(y_pred, index=data_test.index, columns=['SalePrice'])

# The 'data' folder may not exist (for example when the input was read from
# '../input'); create it so to_csv does not fail on a missing directory.
os.makedirs('data', exist_ok=True)
pred_df.to_csv('data/elastic_submission.csv')
pred_df.head()

In [None]:
# Hyperparameter grid for the XGBoost regressor
grid_params = {
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': list(range(3, 11)),
    'min_child_weight': list(range(1, 5)),
    'alpha': [0.01, 0.1, 0.5, 1, 10],
    'lambda': [0.01, 0.1, 0.5, 1, 10],
}

# The XGBoost steps below work on the underlying NumPy arrays
X_train_values = X_train.values
y_train_values = y_train.values

# 5-fold cross-validated exhaustive search over the grid
grid_search = GridSearchCV(estimator=XGBRegressor(), param_grid=grid_params, cv=5)
grid_search.fit(X_train_values, y_train_values)
best_params = grid_search.best_params_

# Fit a fresh model with the selected hyperparameters
xgb_model = XGBRegressor(**best_params)
xgb_model.fit(X_train_values, y_train_values)

# MSE on the train and validation splits
xgb_train_mse = mean_squared_error(y_train, xgb_model.predict(X_train_values))
xgb_cv_mse = mean_squared_error(y_cv, xgb_model.predict(X_cv.values))

scores.loc['xgb'] = xgb_train_mse, xgb_cv_mse
scores.head()

In [None]:
# Scatter the train and validation MSE of every model, one series per split
for split_name in ('train', 'cv'):
    plt.scatter(scores.index, scores[split_name], label=split_name)

plt.legend()
plt.show()

In [None]:
# Predict on the test rows with the XGBoost model and save a submission file
y_pred = xgb_model.predict(X_test.values)

pred_df = pd.DataFrame(y_pred, index=data_test.index, columns=['SalePrice'])

# The 'data' folder may not exist (for example when the input was read from
# '../input'); create it so to_csv does not fail on a missing directory.
os.makedirs('data', exist_ok=True)
pred_df.to_csv('data/xgb_submission.csv')

In [None]:
# Fully connected regression network: four Dense -> BatchNormalization ->
# Dropout stages followed by a single linear output unit.
model = keras.Sequential([
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(512, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Dense(512, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1)
])

# Mean squared error as the loss, mean absolute error reported as a metric
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Cast everything to float32 before handing it to Keras. The feature table can
# contain object-dtype columns, in which case `.values` is an object array that
# TensorFlow cannot convert to a tensor.
X_train_nn = X_train.values.astype('float32')
y_train_nn = y_train.values.astype('float32')
X_cv_nn = X_cv.values.astype('float32')
y_cv_nn = y_cv.values.astype('float32')

model.fit(X_train_nn, y_train_nn, epochs=100, batch_size=32, validation_data=(X_cv_nn, y_cv_nn))

