# Kaggle || Housing Prices: Advanced Regression Techniques

## Step 1: Import Packages

In [1]:
import numpy as np
import pandas as pd
import time
from datetime import datetime

from sklearn.model_selection import cross_val_score
#sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## Step 2: Import Dataset

In [2]:
df_train = pd.read_csv('/Users/austinwhaley/github_repos/DSI-SF-4-austinmwhaley/other_datasets/kaggle_housing_train.csv')
labels = df_train['SalePrice']
df_train = df_train.drop(['SalePrice'], 1)
df_train.shape

(1460, 80)

In [3]:
df_test = pd.read_csv('/Users/austinwhaley/github_repos/DSI-SF-4-austinmwhaley/other_datasets/kaggle_housing_test.csv')
ids = df_test[['Id']]
df_test.shape

(1459, 80)

## Step 3: Clean Dataset

In [4]:
# Average is a special type of munging and needs to be done first
for i in [df_train, df_test]:
    avg_cat = ['GarageYrBlt']
    for j in avg_cat: #Average Categories
        avg = round(np.mean(i[j][i[j].isnull() == False]), 0)
        i[j] = i[j].map(lambda x: avg if pd.isnull(x) == True else x) #Turn NaN to avg

In [5]:
# For all the float64 type columns, map the null values as 0
for i in [df_train, df_test]:
    reg_cat = i.select_dtypes(include=['float64', 'int64'])
    for j in reg_cat.columns: #Regression Categories
        i[j] = i[j].map(lambda x: 0 if pd.isnull(x) == True else x) #Turn NaN to 0

In [6]:
# For all the object type columns, map the null values as 'None'
for i in [df_train, df_test]:
    cat_cat = i.select_dtypes(include=['object'])
    for j in cat_cat.columns: #Categorical Categories
        i[j] = i[j].map(lambda x: 'None' if pd.isnull(x) == True else x) #Turn NaN to 'None'

In [7]:
#Combine train and test datasets for processing
df_concat = pd.concat([df_train, df_test])

In [8]:
h_dum = pd.get_dummies(df_concat)
#h_train_dum = pd.get_dummies(df_train)
#h_test_dum = pd.get_dummies(df_test)
#print h_train_dum.shape
#print h_test_dum.shape
print h_dum.shape

(2919, 311)


In [10]:
X_train = h_dum.drop(['Id'], 1)[:1460]
X_test = h_dum.drop(['Id'], 1)[1460:]
y_train = labels

print 'X_train =', X_train.shape
print 'y_train =', y_train.shape

X_train = (1460, 310)
y_train = (1460,)


## Step 4: Exploritory Data Analysis (EDA)

## Step 5: Modeling

### 5.1: Generalized Linear Models

#### 5.1.1: Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
start_time = time.time()
LR = LinearRegression(n_jobs=-1).fit(X_train, y_train)
scores = cross_val_score(LR, X_train, y_train, cv=10, n_jobs=-1)
#print scores
print round(np.mean(scores),3), '= avg_r2'
print 'Runtime =', round(time.time() - start_time, 2), 'seconds'

0.593 = avg_r2
Runtime = 0.87 seconds


#### 5.1.2: Ridge Regression

In [11]:
from sklearn.linear_model import Ridge
start_time = time.time()
R = Ridge().fit(X_train, y_train)
scores = cross_val_score(R, X_train, y_train, cv=10, n_jobs=1)
#print scores
print round(np.mean(scores),3), '= avg_r2'
print 'Runtime =', round(time.time() - start_time, 2), 'seconds'

0.837 = avg_r2
Runtime = 0.26 seconds


#### 5.1.3: Lasso Regression

In [12]:
from sklearn.linear_model import Lasso
start_time = time.time()
L = Lasso().fit(X_train, y_train)
scores = cross_val_score(L, X_train, y_train, cv=10, n_jobs=-1)
#print scores
print np.mean(scores), '= avg_r2'
print round(time.time() - start_time, 2), 'seconds'



0.803524578118 = avg_r2
3.93 seconds


#### 5.1.4: Elastic Net Regression

In [13]:
from sklearn.linear_model import ElasticNet
start_time = time.time()
EN = ElasticNet().fit(X_train, y_train)
scores = cross_val_score(EN, X_train, y_train, cv=10, n_jobs=-1)
#print scores
print round(np.mean(scores),3), '= avg_r2'
print 'Runtime =', round(time.time() - start_time, 2), 'seconds'

0.816 = avg_r2
Runtime = 4.97 seconds


### 5.2: Decision Tress

#### 5.2.1: Decison Tree Regressor

In [14]:
from sklearn.tree import DecisionTreeRegressor
start_time = time.time()
DTR = DecisionTreeRegressor().fit(X_train, y_train)
scores = cross_val_score(DTR, X_train, y_train, cv=10, n_jobs=-1)
#print scores
print round(np.mean(scores),3), '= avg_r2'
print 'Runtime =', round(time.time() - start_time, 2), 'seconds'

0.765 = avg_r2
Runtime = 0.69 seconds


### 5.3: Ensemble Methods

#### 5.3.1: Random Forest

In [15]:
from sklearn.ensemble import RandomForestRegressor
start_time = time.time()
RFR = RandomForestRegressor(n_estimators=100, verbose=0, n_jobs=-1).fit(X_train, y_train)
scores = cross_val_score(RFR, X_train, y_train, cv=10)
#print scores
print round(np.mean(scores),3), '= avg_r2'
print 'Runtime =', round(time.time() - start_time, 2), 'seconds'

0.863 = avg_r2
Runtime = 19.94 seconds


#### 5.3.2: AdaBoost

In [16]:
from sklearn.ensemble import AdaBoostRegressor
start_time = time.time()
ADA = AdaBoostRegressor().fit(X_train, y_train)
scores = cross_val_score(ADA, X_train, y_train, cv=10, verbose=1)
#print scores
print round(np.mean(scores),3), '= avg_r2'
print 'Runtime =', round(time.time() - start_time, 2), 'seconds'

0.794 = avg_r2
Runtime = 8.4 seconds


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.7s finished


### 5.4: Neural-Networks

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import rmsprop, Adamax

import keras.backend as K
#K._BACKEND == 'tensorflow'

def custom_r2(y_true, y_pred):
    baseline = K.sum((y_true - K.mean(y_true))**2)
    model_fit = K.sum((y_true - y_pred)**2)
    return 1. - model_fit/baseline

Using Theano backend.


In [43]:
def run_neural_network(model, X_train, y_train, n_epochs, batch_size=1, verbose=0, save_weights=0):
    '''
    Runs neural_network.

    Parameters
    ----------
    '''
    #try: 
    start_time = time.time()
    print 'Executing ||', str(datetime.now()) 

    history = model.fit(X_train, y_train, batch_size=batch_size, verbose=verbose, nb_epoch=n_epochs, shuffle=False)

    print 'Execution_Complete || Runtime w/', n_epochs, 'epochs =', round((time.time() - start_time)/60., 2), 'minutes'

#     if save_weights == 1:
#         save_unit_weights()

    return history
#     except KeyboardInterrupt:
#         pass
#     except:
#         print '--Error in running model---'

In [36]:
def plot_neural_net_results(history, limit_r2=0, save_results=0):
    '''
    Plots results of neural network.

    Parameters
    ----------
    '''
    results = pd.DataFrame(history.history)
    #results = pd.read_csv('/Users/austinwhaley/Desktop/DSI-SF-4-austinmwhaley/other_datasets/uber_results_mse_attn_1000').drop('Unnamed: 0', 1)

    # SUMMARY RESULTS
    print '---MAX RESULTS--- \n', results['custom_r2'].iloc[:].max(), '\n'
    print '---TAIL RESULTS--- \n', results['custom_r2'].iloc[-1:], '\n'

    # Build figure
    fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(12,4.5))

    # Assign Axis Plots
    ax0.plot(results['custom_r2'].iloc[:])
    ax1.plot(results['loss'].iloc[:])

    # Add horizontal and vertical line for max
    ax0.axhline(results['val_custom_r2'].max(), color='red', lw=0.5)
    ax0.axvline(results['val_custom_r2'].idxmax(), color='red', lw=0.5)

    #Set axis limits
    if limit_r2 == 1:
        ax0.set_ylim(bottom=0.5, top=0.6)

    # Rename axis titles
    ax0.set_title('r2', fontsize=15)
    ax1.set_title('loss', fontsize=15)

    # Show plots
    plt.show()

    if save_results == 1:
        file_name = str(datetime.now()).replace(':', '_').replace(' ','_')[:19]
        results.to_csv('/Users/austinwhaley/github_repos/DSI-SF-4-austinmwhaley/other_datasets/results_'+ file_name + '.csv')

In [37]:
def build_model(layers, n_features, dropout, n_outputs, compile_model=1, load_weights=0, loss='mse', lr=0.01):
    '''
    Builds a n-Layer neural_network 
    
    Parmaters
    ---------
    
    '''
    model = Sequential()
    
    if layers == 3:
        model.add(Dense(64, input_dim=n_features, activation='relu'))
        model.add(Dropout(dropout))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout))
        model.add(Dense(n_outputs, activation='relu'))
    elif layers == 5:
        pass
    elif layers == 10:
        pass
    

    if compile_model == 1:
        model.compile(loss=loss, optimizer=Adamax(lr=lr), metrics=[custom_r2])

    if load_weights == 1:
        model.load_weights('/Users/austinwhaley/github_repos/DSI-SF-4-austinmwhaley/other_datasets/weights_'+file_name+'.h5')
        
    return model

In [41]:
print X_train.shape
print y_train.shape

(1460, 310)
(1460,)


#### 5.4.1: 3-Layer Fully-Connected

In [45]:
model = build_model(layers=3, n_features=X_train.shape[1], dropout=0.5, n_outputs=1, compile_model=1)
history = run_neural_network(model, X_train, y_train, n_epochs=5, batch_size=64)
#print history
#plot_neural_net_results(history, save_results=0)

Executing || 2017-02-16 22:05:02.565215


ValueError: ('shapes (1460,64) and (310,64) not aligned: 64 (dim 1) != 310 (dim 0)', (1460, 64), (310, 64))
Apply node that caused the error: Dot22(dense_input_17, dense_49_W)
Toposort index: 2
Inputs types: [TensorType(float32, matrix), TensorType(float32, matrix)]
Inputs shapes: [(1460, 64), (310, 64)]
Inputs strides: [(4, 5840), (256, 4)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Elemwise{add,no_inplace}(Dot22.0, InplaceDimShuffle{x,0}.0), Elemwise{Composite{(Abs(i0) + i1 + i2)}}[(0, 1)](Elemwise{add,no_inplace}.0, Dot22.0, InplaceDimShuffle{x,0}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

#### 5.4.2: 5-Layer Fully-Connected

#### 5.4.3: 10-Layer Fully-Connected

## Step 6: Visualizations

## Step 7: Conclusion/ Submission

In [None]:
predictions = pd.DataFrame(RFR.predict(X_test))
predictions.columns = ['SalePrice']
###
predictions.insert(0, 'Id', ids)
#predictions.set_index('Id', inplace=True)
#predictions.reset_index(inplace=True)
predictions.head()

In [None]:
predictions.to_csv('/Users/austinwhaley/Desktop/DSI-SF-4-austinmwhaley/other_datasets/kaggle_house_submission.csv', header=True, index=False)