In [2]:
# Keras Nerual Network Model Notebook

In [3]:
# ---------------------------------- #

In [6]:
# import functions from model_functions notebook

In [325]:
top_features = [
     'LotArea',
     'LotFrontage',
     'BsmtUnfSF',
     'GrLivArea',
     'OverallQual',
     'HouseSFScore',
     'BsmtFinSF1',
     'GarageArea',
     'PorchScore',
     'BsmtIncompleteRatio',
     '2ndFlrSF',
     'GarageYrBlt',
     'MasVnrArea',
     'overallScore',
     'WoodDeckSF',
     '1stFlrSF',
     'YearBuilt',
     'OverallCond',
     'ageScore',
     'TotalBsmtSF',
     'YearRemodAdd',
     'BathScore',
     'Neighborhood_StoneBr',
     'OpenPorchSF',
     'YrSold',
     'BsmtFinSF2',
     'Neighborhood_Crawfor',
     'ScreenPorch',
     'Functional_NonTyp',
     'Neighborhood_Somerst'
]

In [262]:
from keras import backend as K
from keras.optimizers import Adam, RMSprop
from keras import regularizers

In [4]:
%run -i model_functions.ipynb

In [155]:
# Function to call all feature engineering functions
def feature_engineering(df):
    print("Starting feature engineering...")
    df = year_to_age(df)
    df = build_meta_features(df)
    df = log_xform_features(df)
    print("Feature engineering complete.")
    return df

In [154]:
def log_xform(df):
    df = log_xform_features(df)
    return df

In [214]:
def run_model():
    
    train_data, test_data = get_data()
    target = train_data['SalePrice']
    train_data = remove_null_columns(train_data)
    numerical_features, categorical_features = get_feature_lists(train_data)
    
    df_num_train, df_cat_train = impute_features(train_data[numerical_features].copy(), train_data[categorical_features].copy())
    df_num_test, df_cat_test = impute_features(test_data[numerical_features].copy(), test_data[categorical_features].copy())
    
    df_num_train, df_num_test = feature_engineering(df_num_train.copy()), feature_engineering(df_num_test.copy())
    df_num_train, df_num_test = scale_features(df_num_train.copy()), scale_features(df_num_test.copy())
    
    common_categories = prune_categorical_features(df_cat_train, df_cat_test)
    
    df_train = combine_dataframes(df_num_train, df_cat_train[common_categories])
    df_test = combine_dataframes(df_num_test, df_cat_test[common_categories])
    
#     df_train = feature_engineering(df_train.copy())
#     df_test = feature_engineering(df_test.copy())
    
    # Change the line below to substitute different models
    model = keras_model(df_train, target=target)
    
    predictions = model.predict(df_test)
    
    create_submission(test_data, predictions)
    
    return model, df_train

In [264]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [216]:
def scale_features(df):
    scaler = MinMaxScaler()
    df_scaled = scaler.fit_transform(df)
    df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)
    return df_scaled

In [326]:
train_data, test_data = get_data()
target = train_data['SalePrice']
train_data = remove_null_columns(train_data)
numerical_features, categorical_features = get_feature_lists(train_data)

df_num_train, df_cat_train = impute_features(train_data[numerical_features].copy(), train_data[categorical_features].copy())
df_num_test, df_cat_test = impute_features(test_data[numerical_features].copy(), test_data[categorical_features].copy())

df_num_train, df_num_test = feature_engineering(df_num_train.copy()), feature_engineering(df_num_test.copy())
df_num_train, df_num_test = scale_features(df_num_train.copy()), scale_features(df_num_test.copy())

common_categories = prune_categorical_features(df_cat_train, df_cat_test)

df_train = combine_dataframes(df_num_train, df_cat_train[common_categories])
df_test = combine_dataframes(df_num_test, df_cat_test[common_categories])

df_train, df_test = df_train.copy()[top_features], df_test.copy()[top_features]

Grabbing train data from 'data/train.csv'...
Grabbing test data from 'data/test.csv'...
Data successfully loaded.
Removing columns that are >50% null...
4 columns were removed.
Obtaining feature column names...
34 numeric features obtained; 41 categorical features obtained
Imputing data...
Imputation complete.
There are now 34 numerical features and 177 categorical features.
Imputing data...
Imputation complete.
There are now 34 numerical features and 172 categorical features.
Starting feature engineering...
Feature engineering: transforming years to time since...
Feature engineering: log transform of relevant numerical features...
Feature engineering complete.
Starting feature engineering...
Feature engineering: transforming years to time since...
Feature engineering: log transform of relevant numerical features...
Feature engineering complete.


In [163]:
df_test.head()

Unnamed: 0,LotArea,LotFrontage,BsmtUnfSF,GrLivArea,OverallQual,HouseSFScore,BsmtFinSF1,GarageArea,PorchScore,BsmtIncompleteRatio,2ndFlrSF,GarageYrBlt,MasVnrArea,overallScore,WoodDeckSF,1stFlrSF,YearBuilt,OverallCond,ageScore,TotalBsmtSF,YearRemodAdd,BathScore,Neighborhood_StoneBr,OpenPorchSF,YrSold
0,0.5663,0.589174,0.730486,0.312002,0.444444,0.030477,0.741323,0.90262,1.3e-05,0.000206,0.0,0.788462,0.0,0.4,0.681466,0.312002,0.374046,0.625,0.383735,0.794653,0.816667,0.0,0,0.0,0.0
1,0.622472,0.59472,0.783517,0.467997,0.555556,0.068093,0.823054,0.78652,4e-06,0.000205,0.0,0.798077,0.654926,0.485714,0.82297,0.467997,0.396947,0.625,0.426024,0.842638,0.866667,0.047619,0,0.546224,0.0
2,0.61395,0.554385,0.642487,0.548554,0.444444,0.058288,0.804475,0.845899,4e-06,8.1e-05,0.870383,0.673077,0.0,0.328571,0.738274,0.325885,0.099237,0.5,0.042892,0.800602,0.2,0.126984,0,0.537818,0.0
3,0.52452,0.577872,0.75418,0.542433,0.555556,0.05727,0.771613,0.842455,4e-06,0.000251,0.865959,0.669872,0.425024,0.485714,0.810925,0.325032,0.091603,0.625,0.040482,0.80035,0.2,0.126984,0,0.546224,0.0
4,0.335525,0.313321,0.90306,0.45313,0.777778,0.063167,0.672061,0.852536,0.00126,0.0018,0.0,0.689103,0.0,0.542857,0.0,0.45313,0.137405,0.5,0.073735,0.83824,0.3,0.047619,1,0.668438,0.0


In [327]:
X_train, X_test, y_train, y_test = train_test_split(df_train, target, test_size=0.2, random_state=23)

In [238]:
# def basic_model_1(x_size, y_size):
#     t_model = Sequential()
#     t_model.add(Dense(100, activation="tanh", input_shape=(x_size,)))
#     t_model.add(Dense(50, activation="relu"))
#     t_model.add(Dense(y_size))
#     print(t_model.summary())
#     t_model.compile(loss='mean_squared_error',
#         optimizer=Adam(),
#         metrics=[metrics.mae])
#     return(t_model)

In [328]:
X_train.shape

(1168, 30)

In [240]:
model = Sequential()

In [241]:
model.add(Dense(100, activation='tanh', input_shape=(X_train.shape[1],)))

In [242]:
model.add(Dense(50, activation='relu'))

In [244]:
model.add(Dense(1))

In [245]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_195 (Dense)            (None, 100)               2600      
_________________________________________________________________
dense_196 (Dense)            (None, 50)                5050      
_________________________________________________________________
dense_197 (Dense)            (None, 1)                 51        
Total params: 7,701
Trainable params: 7,701
Non-trainable params: 0
_________________________________________________________________
None


In [247]:
model.compile(loss='mean_squared_error', optimizer='adam')

In [251]:
def baseline_model():
#     adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model = Sequential()
    model.add(Dense(100, activation='tanh', input_shape=(X_train.shape[1],)))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [329]:
def basic_model_3():
    t_model = Sequential()
    t_model.add(Dense(80, activation="tanh", kernel_initializer='normal', input_shape=(X_train.shape[1],)))
    t_model.add(Dropout(0.3))
    t_model.add(Dense(120, activation="relu", kernel_initializer='normal', 
                      kernel_regularizer=regularizers.l1(0.01), bias_regularizer=regularizers.l1(0.01)))
    t_model.add(Dropout(0.2))
    t_model.add(Dense(20, activation="relu", kernel_initializer='normal', 
                      kernel_regularizer=regularizers.l1_l2(0.01), bias_regularizer=regularizers.l1_l2(0.01)))
    t_model.add(Dropout(0.2))
    t_model.add(Dense(10, activation="relu", kernel_initializer='normal'))
    t_model.add(Dropout(0.0))
    t_model.add(Dense(1))
    t_model.compile(loss='mean_squared_error', optimizer='adam')
    return(t_model)

In [296]:
model

<keras.engine.sequential.Sequential at 0x17f6ab40dd8>

In [330]:
estimator = KerasRegressor(build_fn=basic_model_3, nb_epoch=200, batch_size=5, verbose=0)

In [279]:
kfold = KFold(n_splits=10, random_state=42)

In [280]:
results = cross_val_score(estimator, X_train, y_train, cv=kfold, n_jobs=1)

In [281]:
print("Results: {} ({}) MSE".format(results.mean(), results.std()))

Results: -7878878956.754259 (2460623974.4745364) MSE


In [331]:
estimator.fit(X_train, y_train, epochs=4000, batch_size=128, verbose=0)

<keras.callbacks.History at 0x17f134da898>

In [306]:
score = estimator.evaluate(X_test, y_test, batch_size=128)

AttributeError: 'KerasRegressor' object has no attribute 'evaluate'

In [299]:
score

491640789.91780823

In [185]:
def r2_keras(y_true, y_pred):
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return ( 1 - SS_res / (SS_tot + K.epsilon()) )

In [336]:
predictions = estimator.predict(df_test)
    
create_submission(test_data, predictions)

Generating submission file...
Submission file is ready.
