In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

#KERAS MODEL DEFINITION
from keras.layers import Input, Activation, Dropout, Dense, BatchNormalization, Embedding, concatenate, GRU, Flatten, Lambda, Reshape
from keras.models import Model, Sequential
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping#, TensorBoard
from keras import backend as K
from keras import optimizers
from keras import initializers
#from keras.callbacks import ModelCheckpoint
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
# GPU usage
# Build a TensorFlow session whose GPU options have `allow_growth` enabled and
# register it as the session Keras uses.
# NOTE(review): tf.ConfigProto / tf.Session and keras.backend.tensorflow_backend
# look like TF 1.x-era entry points — confirm the pinned TensorFlow/Keras versions.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

%matplotlib inline

Using TensorFlow backend.


# Data load

## Train

In [None]:
train_df = pd.read_csv('../dat/train.csv')
print(train_df.shape)


# Features correlations to target

In [None]:
# Rank the numeric features by their correlation with the target.
print("Find most important features relative to target")
# Restrict to numeric columns explicitly: the raw frame also holds string
# columns, and recent pandas versions raise instead of silently skipping them.
corr = train_df.select_dtypes(include=[np.number]).corr()
# Rebind instead of sorting in place so re-running the cell is harmless.
corr = corr.sort_values("SalePrice", ascending=False)
print(corr.SalePrice)
# Trailing semicolon hides the Axes repr in the cell output.
corr.SalePrice.plot(kind="barh");

# Target

In [None]:
y_ = train_df.SalePrice

In [None]:
# Hist
plt.figure(figsize=(12,8))
#sns.distplot(train_df.price.values, bins=50, kde=False)
y_.hist(bins=50)
plt.xlabel('price', fontsize=12)
plt.show()

Note how the price is skewed. 

In [None]:
# Log transform of the skewed numerical features to lessen impact of outliers
# Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
# As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewed
print(skew(y_)) # >0.5


In [None]:
# Pull the target out of the training frame and keep a log1p-transformed copy,
# which is what the model is trained on.
y_ = train_df['SalePrice']
y_log = np.log1p(y_)
y = y_log
train_df = train_df.drop('SalePrice', axis=1)
# Number of training rows: used later to split the combined frame back apart.
train_idx = train_df.shape[0]

In [None]:
# Histogram of the log-transformed target (log1p avoids log(0)).
plt.figure(figsize=(12, 8))
y_log.hist(bins=50)
# The plotted quantity is log1p(SalePrice), not the raw price.
plt.xlabel('log1p(price)', fontsize=12)
plt.show()

In [None]:
print(skew(y_log)) # <0.5

## Test

In [None]:
test_df = pd.read_csv('../dat/test.csv')
print(test_df.shape)
# SalePrice has already been dropped from train_df, so the two frames are
# expected to have the same number of columns.
test_df.shape[1] == train_df.shape[1]

## Merge/Concat

In [None]:
# Stack train and test so every preprocessing step sees both; the first
# `train_idx` rows are the training rows.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each source frame's own row labels (duplicate labels make
# label-based assignment and axis=1 concatenation fragile).
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
df.shape

# Handle missing vals

In [None]:
# Number of columns with nulls
def check_nulls(df):
    """Print the per-column null counts of `df` and count the affected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    int
        Number of columns that contain at least one null value.
    """
    null_counts = df.isnull().sum()
    print(null_counts)
    # `Series.nonzero()` no longer exists in current pandas; count the
    # columns with a positive null count directly instead.
    return int((null_counts > 0).sum())

In [None]:
print(check_nulls(df))

## Special columns handling

Handle missing values for features where median/mean or most common value doesn't make sense

In [None]:


# Per-column replacement for missing values, for the features where a
# median / most-common fill would not make sense.
na_fill_values = {
    # Alley : data description says NA means "no alley access"
    "Alley": "NA",
    # BedroomAbvGr : NA most likely means 0
    "BedroomAbvGr": 0,
    # BsmtQual etc : data description says NA for basement features is "no basement"
    "BsmtQual": "No",
    "BsmtCond": "No",
    "BsmtExposure": "No",
    "BsmtFinType1": "No",
    "BsmtFinType2": "No",
    "BsmtFullBath": 0,
    "BsmtHalfBath": 0,
    "BsmtUnfSF": 0,
    # CentralAir : NA most likely means No
    "CentralAir": "N",
    # Condition : NA most likely means Normal
    "Condition1": "Norm",
    "Condition2": "Norm",
    # EnclosedPorch : NA most likely means no enclosed porch
    "EnclosedPorch": 0,
    # External stuff : NA most likely means average
    "ExterCond": "TA",
    "ExterQual": "TA",
    # Fence : data description says NA means "no fence"
    "Fence": "No",
    # FireplaceQu : data description says NA means "no fireplace"
    "FireplaceQu": "No",
    "Fireplaces": 0,
    # Functional : data description says NA means typical
    "Functional": "Typ",
    # GarageType etc : data description says NA for garage features is "no garage"
    "GarageType": "No",
    "GarageFinish": "No",
    "GarageQual": "No",
    "GarageCond": "No",
    "GarageArea": 0,
    "GarageCars": 0,
    # HalfBath : NA most likely means no half baths above grade
    "HalfBath": 0,
    # HeatingQC : NA most likely means typical
    "HeatingQC": "TA",
    # KitchenAbvGr : NA most likely means 0
    "KitchenAbvGr": 0,
    # KitchenQual : NA most likely means typical
    "KitchenQual": "TA",
    # LotFrontage : NA most likely means no lot frontage
    "LotFrontage": 0,
    # LotShape : NA most likely means regular
    "LotShape": "Reg",
    # MasVnrType : NA most likely means no veneer
    "MasVnrType": "None",
    "MasVnrArea": 0,
    # MiscFeature : data description says NA means "no misc feature"
    "MiscFeature": "No",
    "MiscVal": 0,
    # OpenPorchSF : NA most likely means no open porch
    "OpenPorchSF": 0,
    # PavedDrive : NA most likely means not paved
    "PavedDrive": "N",
    # PoolQC : data description says NA means "no pool"
    "PoolQC": "No",
    "PoolArea": 0,
    # SaleCondition : NA most likely means normal sale
    "SaleCondition": "Normal",
    # ScreenPorch : NA most likely means no screen porch
    "ScreenPorch": 0,
    # TotRmsAbvGrd : NA most likely means 0
    "TotRmsAbvGrd": 0,
    # Utilities : NA most likely means all public utilities
    "Utilities": "AllPub",
    # WoodDeckSF : NA most likely means no wood deck
    "WoodDeckSF": 0,
    # LandSlope : Most likely NA means Mod
    "LandSlope": "Mod",
    # Street : Most likely NaN means Pave
    "Street": "Pave",
}

# A dict passed to fillna is applied column by column; columns that are not
# listed keep their missing values for the generic handling further down.
df = df.fillna(value=na_fill_values)

# Ordinal values

Encode some categorical features as ordered numbers when there is information in the order.



## Case 1: Ordinals as numerical features

In this setting, we will replace all the ordinal values with numerical ones. 

They will be scaled normally later.

Now, for the specially handled missing values (like None or No), these need to be given an ordinal number as well. Otherwise the None or No stays a string, the whole column is typed as object, and it will be handled as a categorical feature (OHE/embedding) instead.



In [None]:

# Scales shared by several columns.
# Plain quality scale: worst (Po) to best (Ex).
quality_scale = {"Po": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4}
# Same scale shifted by one so that the "No" placeholder filled in earlier
# takes rank 0.
quality_scale_with_no = {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
basement_finish_scale = {"No": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4,
                         "ALQ": 5, "GLQ": 6}

# column name -> {category label -> ordinal rank}
ordinal_encoding = {
    "Alley": {"NA": 0, "Grvl": 1, "Pave": 2},
    "BsmtCond": dict(quality_scale_with_no),
    "BsmtExposure": {"No": 0, "Mn": 1, "Av": 2, "Gd": 3},
    "BsmtFinType1": dict(basement_finish_scale),
    "BsmtFinType2": dict(basement_finish_scale),
    "BsmtQual": dict(quality_scale_with_no),
    "ExterCond": dict(quality_scale),
    "ExterQual": dict(quality_scale),
    "FireplaceQu": dict(quality_scale_with_no),
    "Functional": {"Sal": 0, "Sev": 1, "Maj2": 2, "Maj1": 3, "Mod": 4,
                   "Min2": 5, "Min1": 6, "Typ": 7},
    "GarageCond": dict(quality_scale_with_no),
    "GarageQual": dict(quality_scale_with_no),
    "HeatingQC": dict(quality_scale),
    "KitchenQual": dict(quality_scale),
    "LandSlope": {"Sev": 0, "Mod": 1, "Gtl": 2},
    "LotShape": {"IR3": 0, "IR2": 1, "IR1": 2, "Reg": 3},
    "PavedDrive": {"N": 0, "P": 1, "Y": 2},
    # PoolQC has no "Po" level, so it gets its own scale.
    "PoolQC": {"No": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4},
    "Street": {"Grvl": 0, "Pave": 1},
    "Utilities": {"ELO": 0, "NoSeWa": 1, "NoSewr": 2, "AllPub": 3},
}

df = df.replace(ordinal_encoding)

In [None]:
df.head()


In [None]:
# Columns that were mapped to ordered integers in the previous step.
ordinal_cols = [
    "Alley",
    "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "BsmtQual",
    "ExterCond", "ExterQual",
    "FireplaceQu",
    "Functional",
    "GarageCond", "GarageQual",
    "HeatingQC",
    "KitchenQual",
    "LandSlope", "LotShape",
    "PavedDrive",
    "PoolQC",
    "Street",
    "Utilities",
]
ordinal_view = df[ordinal_cols]
print(ordinal_view.dtypes)
ordinal_view.head(3)

Check that none of the ordinal columns is still of type object (an object column would not be counted as numerical).

In [None]:
ordinal_obj_cols = df[ordinal_cols].select_dtypes(include = ["object"]).columns
print(len(ordinal_obj_cols))
#df[ordinal_obj_cols]

Check that the ordinal columns have no missing values

In [None]:
print(check_nulls(df[ordinal_cols]))

Check some ordinals before transformations

In [None]:
for col in ordinal_cols:
    print(df[col].describe())

# Case 2: ordinals as OHE (not used — the cell below is a disabled sketch kept inside a string literal)

In [None]:
'''
# Encode some categorical features as ordered numbers when there is information in the order
df["Alley"].astype('object')
df["BsmtCond"].astype('object')


"BsmtCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "BsmtExposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
                       "BsmtFinType1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtFinType2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                       "ExterCond" : {"Po" : 0, "Fa" : 1, "TA": 2, "Gd": 3, "Ex" : 4},
                       "ExterQual" : {"Po" : 0, "Fa" : 1, "TA": 2, "Gd": 3, "Ex" : 4},
                       "FireplaceQu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Functional" : {"Sal" : 0, "Sev" : 1, "Maj2" : 2, "Maj1" : 3, "Mod": 4, 
                                       "Min2" : 5, "Min1" : 6, "Typ" : 7},
                       "GarageCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "GarageQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "HeatingQC" : {"Po" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "KitchenQual" : {"Po" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "LandSlope" : {"Sev" : 0, "Mod" : 1, "Gtl" : 2},
                       "LotShape" : {"IR3" : 0, "IR2" : 1, "IR1" : 2, "Reg" : 3},
                       "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
                       "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "Street" : {"Grvl" : 0, "Pave" : 1},
                       "Utilities" : {"ELO" : 0, "NoSeWa" : 1, "NoSewr" : 2, "AllPub" : 3}}
                     )
                     
'''                     

## Get categorical columns

# Some numerical features are actually really categories

In [None]:
# Some numerical features are actually really categories: recode them as
# strings so they end up with the other object-typed (categorical) columns.
subclass_codes = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190]
month_abbrs = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

df = df.replace({
    # 20 -> "SC20", 30 -> "SC30", ...
    "MSSubClass": {code: "SC" + str(code) for code in subclass_codes},
    # 1 -> "Jan", ..., 12 -> "Dec"
    "MoSold": {number: abbr for number, abbr in enumerate(month_abbrs, start=1)},
})

In [None]:
# Every column still typed as object is treated as a categorical feature.
categorical_features = df.select_dtypes(include=["object"]).columns
print("Categorical features : " + str(categorical_features.size))
df_cat = df.loc[:, categorical_features]
print(df_cat.shape)

In [None]:
print(check_nulls(df_cat))

## Fill categorical nulls as "None"

In [None]:
df_cat = df_cat.fillna("None")
print(check_nulls(df_cat))

## Get numerical columns

In [None]:
# Every non-object column is a numerical feature, except the row identifier
# "Id", which is not a feature.
numerical_features = df.select_dtypes(exclude=["object"]).columns.drop("Id")
print("Numerical features : " + str(len(numerical_features)))
df_num = df[numerical_features]

In [None]:
check_nulls(df_num)

Check some ordinals before transformations

In [None]:
for col in ordinal_cols:
    print(df_num[col].describe())

### Fill the remaining numerical columns' nulls with the median

In [None]:
df_num = df_num.fillna(df_num.median())

Check some ordinals before transformations

In [None]:
for col in ordinal_cols:
    print(df_num[col].describe())

### Handle skewed numerical cols as log

Check some ordinals before transformations

In [None]:
for col in ordinal_cols:
    print(df_num[col].describe())

Note: you will never reach 0 skewed features!

In [None]:
# Log-transform the skewed numerical features to lessen the impact of outliers.
# Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
# Rule of thumb: |skewness| > 0.5 counts as at least moderately skewed.
# The ordinal columns are left out of the skewness check.
skew_cols = [name for name in df_num.columns if name not in ordinal_cols]

skewness = df_num[skew_cols].apply(skew)
is_skewed = skewness.abs() > 0.5
skewness = skewness[is_skewed]
print(str(skewness.shape[0]) + " skewed numerical features to log transform")

skewed_features = skewness.index
print(skewed_features)
df_num[skewed_features] = np.log1p(df_num[skewed_features])

## Scaling the numerical features

Check some ordinals before transformations

In [None]:
for col in ordinal_cols:
    print(df_num[col].describe())

In [None]:
df_num.head()

In [None]:
# Demo on a single column. The scaler expects 2-D input with one row per
# measurement, so:
#   df_num["LotFrontage"] / df_num["LotFrontage"].values -> wrong, 1-D
#   [df_num["LotFrontage"].values]                       -> wrong, all entries become ONE row
#   df_num[["LotFrontage"]]                              -> correct, one-column 2-D frame
scaler = StandardScaler()
lot_frontage_2d = df_num[["LotFrontage"]]
s = scaler.fit_transform(lot_frontage_2d)
s

In [None]:
# Standardised preview of the numerical features; `d` is only displayed and
# then deleted further down.
scaler = StandardScaler()
# `.copy()` matters: a plain `d = df_num` is just a second name for the same
# frame, so writing into `d` would overwrite `df_num` itself.
d = df_num.copy()
for col in df_num.columns:
    # One-column frame keeps the 2-D shape the scaler expects.
    d[col] = scaler.fit_transform(df_num[[col]])

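StandardScaler centres each column to zero mean and scales it to unit variance, so the result contains negative values and is not confined to a fixed range. This is not exactly what we want, because our features have no negatives. All we want is to scale into the range [0,1].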

For that, we have to use MinMaxScaler instead:
    
    X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    X_scaled = X_std * (max - min) + min
    


In [None]:
d.head()

In [None]:
del d

In [None]:
# Rescale every numerical column, one at a time, with MinMaxScaler's default
# [0, 1] output range.
scaler = MinMaxScaler()
for name in list(df_num.columns):
    single_column = df_num[[name]]  # 2-D, as the scaler requires
    df_num[name] = scaler.fit_transform(single_column)

In [None]:
df_num.head()

Check some ordinals after transformations

In [None]:
for col in ordinal_cols:
    print(df_num[col].describe())

# Do NOT encode categorical features as OHE for the NN
We are going to use embedding tables. So the input is an index.

In [None]:
print(df_cat.shape)
df_cat.head()


## Fit a LabelEncoder
Since we want an index, we need a LabelEncoder for each categorical column

We will arrange the embeddings tables info in tuples:
    (max_num_categories, emb_sz)
    
    emb_sz = min(50, (max_num_categories+1)/2)

In [None]:
# Turn every categorical column into integer indices and record, per column,
# the (embedding table rows, embedding width) pair used to build the model.
le = LabelEncoder()
embeddings = []
for col in categorical_features:
    df_cat[col] = le.fit_transform(df_cat[col])
    largest_index = df_cat[col].max()
    # +2 -> +1 for the UNK and +1 for the max itself
    table_rows = largest_index + 2
    emb_width = min(50, int(largest_index / 2) + 1)
    embeddings.append((table_rows, emb_width))

In [None]:
print(df_cat.shape)
df_cat.head()

In [None]:
print(len(embeddings))
embeddings

In [None]:
# Width of the concatenated embedding vector: the sum of the per-column
# embedding sizes (second element of each tuple).
total_emb_concat_sz = sum(emb_sz for _, emb_sz in embeddings)

print(total_emb_concat_sz)

## Merge numerical and categorical columns again

In [None]:
df = pd.concat([df_num, df_cat], axis = 1)
df.shape

In [None]:
# Print the size and names of: all columns, the numerical ones, the categorical ones.
for names in (df.columns, numerical_features, categorical_features):
    print(len(names))
    print(names)

# `df` was rebuilt as [numerical columns | categorical columns], so the
# categorical block starts right after the last numerical column.
numerical_cols_last_idx = len(numerical_features)
categorial_cols_start_idx = numerical_cols_last_idx

So we have `len(categorical_features)` embedding tables.



## Re-split train and test data

In [None]:
# The first `train_idx` rows of the combined frame are the training rows, the
# rest are the test rows.
X = df.iloc[:train_idx]
X_test = df.iloc[train_idx:]
for original_frame in (train_df, test_df):
    print(original_frame.shape)

## Train/Val split

In [None]:
# Hold out 25% of the training rows for validation.
# `.values` replaces `as_matrix()`, which current pandas no longer provides.
# A fixed random_state makes the split (and therefore the validation scores)
# reproducible across kernel restarts.
X_train, X_val, Y_train, Y_val = train_test_split(
    X.values, y.values, test_size=0.25, random_state=42)
print(X.shape)
print(y.shape)
print(X_train.shape)
print(Y_train.shape)
print(X_val.shape)
print(Y_val.shape)

In [None]:
#params
dr_r = 0.1  # dropout rate; currently not wired into any layer

n_features = X.shape[1]

#Inputs
# One flat row per sample: the numerical columns first, then the
# label-encoded categorical columns.
input_l1 = Input(shape=[n_features,])
# Add a trailing axis so that slicing out a single column leaves a length-1
# axis for the Embedding layer to consume.
input_l = Reshape([n_features, 1])(input_l1)
print(input_l.shape)


# A. Categorical/Embedding stream

# Slice the input, for each category, at position categorial_cols_start_idx + i.
# The position is handed to the layer through `arguments` rather than read
# from the loop variable inside the lambda: a closure would see whatever value
# `i` holds when the function is eventually called, not the value it had when
# the layer was created.
category = []
for i in range(len(embeddings)):
    category.append(
        Lambda(lambda x, col_idx: x[:, col_idx],
               arguments={'col_idx': categorial_cols_start_idx + i})(input_l))

#Embeddings layers: one table per categorical column, sized by `embeddings`
emb_category = [
    Embedding(table_rows, emb_sz)(col_input)
    for (table_rows, emb_sz), col_input in zip(embeddings, category)
]

# Concat all embeddings in one layer (concatenate needs at least two inputs,
# so a single embedding is used as is).
flat_embs = [Flatten()(emb) for emb in emb_category]
cat_l = flat_embs[0] if len(flat_embs) == 1 else concatenate(flat_embs)

print(cat_l.shape)

# B. Numerical stream

# Slice the numerical part of the input (everything before the first
# categorical column).
numericals = Lambda(lambda x, stop: x[:, :stop],
                    arguments={'stop': categorial_cols_start_idx})(input_l)
# Dense numerical layers
num_l = Flatten()(numericals)
num_l = Dense(100)(num_l)

print(num_l.shape)

# Concat numericals + categorical
main_l = concatenate([num_l, cat_l])

print(main_l.shape)

# NOTE(review): none of the Dense layers sets an activation, so this stack is
# a composition of linear maps — confirm whether a non-linearity (and the
# Dropout that `dr_r` was meant for) was intended.
for units in (100, 50, 25, 10, 5):
    main_l = Dense(units)(main_l)


#output: a single linear unit predicting log1p(SalePrice)
output = Dense(1, activation="linear")(main_l)

#model
model = Model(input_l1, output)
model.summary()


In [None]:

# Adam with its default settings, mean squared error on the log-price target.
optimizer = optimizers.Adam()
model.compile(loss="mse", optimizer=optimizer)

epochs = 200
BATCH_SIZE = 10

# Learning-rate decay schedule. These values are only computed here; nothing
# in this cell applies them to the optimizer.
steps = int(len(X_train) / BATCH_SIZE) * epochs

def exp_decay(init, fin, steps):
    """Per-step decay rate for going from `init` to `fin` over `steps` updates."""
    return (init / fin) ** (1 / (steps - 1)) - 1

lr_init, lr_fin = 0.009, 0.006
lr_decay = exp_decay(lr_init, lr_fin, steps)

# Write a checkpoint only when the monitored validation loss improves.
checkpointer = ModelCheckpoint(filepath="weights.hdf5", verbose=1, save_best_only=True)

print(X_train.shape)
history = model.fit(
    X_train, Y_train,
    epochs=epochs,
    batch_size=BATCH_SIZE,
    validation_data=(X_val, Y_val),
    verbose=1,
    callbacks=[checkpointer],
)


# Evaluate

In [None]:
# Restore the checkpointed weights and score them on the validation split.
# The targets are log1p(SalePrice), so all errors are on the log scale.
model.load_weights('weights.hdf5')
predictions = model.predict(X_val)

# scikit-learn metrics take (y_true, y_pred), in that order.
mae = mean_absolute_error(Y_val, predictions)
mse = mean_squared_error(Y_val, predictions)  # computed once, reused for the RMSE

print("Mean Absolute Error : " + str(mae))
print("Mean Squared Error : " + str(mse))
print("Root Mean Squared Error : " + str(np.sqrt(mse)))

# Submit

In [None]:
# Predict the (log-scale) target for the test rows.
# `.values` replaces `as_matrix()`, which current pandas no longer provides.
predictions = model.predict(X_test.values)

In [None]:
predicted_prices = np.expm1(predictions)

In [None]:
# Flatten the model output to 1-D without hard-coding the number of test rows.
predicted_prices = predicted_prices.reshape(-1)
predicted_prices.shape

In [None]:
# Pair each test Id with its predicted price and write the submission file
# (any filename would do; 'submission.csv' is used here).
submission_columns = {'Id': test_df['Id'], 'SalePrice': predicted_prices}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)


