### Project Steps
1. 'Unpack Data'
2. Put the data into an easy-to-work-with format (a pandas DataFrame)
3. Visualize data

### Dependencies
- Sklearn
- Pandas
- Numpy
- Keras
- Tensorflow
- pydot (for visualization)

### Global variables
- train_data (dataframe)
- test_data (dataframe)
- X (training features from train_data [slice of dataframe])
- Y (training target, the SalePrice column of train_data [slice of dataframe])
- X_test (test data from test_data)

***No Y_test is given by Kaggle: Kaggle hides the solution to the test data to keep users honest in the competition.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import linear_model

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# Load the training and test sets from the local House_Data directory
train_data = pd.read_csv('House_Data/train.csv')
test_data = pd.read_csv('House_Data/test.csv')

Using TensorFlow backend.


In [2]:
# Creating list of categorical variables

def find_categorical(data):
    """Return the names of the columns that are treated as categorical.

    A column counts as categorical when its dtype is neither ``int64`` nor
    ``float64``.

    Args:
        data: DataFrame whose columns are inspected.

    Returns:
        List of column names, in the same order as ``data.columns``.
    """
    numeric_dtype_names = ('int64', 'float64')
    return [
        name
        for name in data.columns
        if str(data[name].dtype) not in numeric_dtype_names
    ]

# Collect the categorical column names from the training set
categorical_columns = find_categorical(train_data)
# Bare expression: displays the list as the notebook cell output
categorical_columns

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [3]:
# Replace categorical variables with quantitative ones (via one hot encoding)

def replace_categorical(data,categorical_columns):
    """Replace each categorical column with one-hot indicator columns.

    Every listed column is expanded with ``pd.get_dummies``; each new column
    is named ``<category value>_<original column name>`` so its origin stays
    visible. The indicator columns are appended to the frame and the original
    categorical column is removed. The input frame is not modified.

    Args:
        data: DataFrame containing the columns to encode.
        categorical_columns: Iterable of column names to one-hot encode.

    Returns:
        A new DataFrame holding the untouched columns followed by the
        indicator columns, grouped in the order of ``categorical_columns``.

    Raises:
        KeyError: If a listed column is not present in ``data``.
    """
    for col in categorical_columns:
        # Pin the indicator dtype so the encoded frame is the same numeric
        # 0/1 matrix on every pandas version (the default became bool in 2.0).
        one_hot = pd.get_dummies(data[col], dtype='uint8')

        # Suffix each indicator with the source column name
        one_hot.columns = [str(level) + '_' + col for level in one_hot.columns]

        # NOTE: the previous version tried to drop 'C (all)_<col>' here but
        # discarded the result of DataFrame.drop, so that column was always
        # kept. The dead branch is removed and the column is still kept,
        # because the Keras models in this notebook are sized for the
        # resulting 270 features.
        data = data.join(one_hot).drop(col, axis=1)
    return data


In [4]:
def prep_data(data):
    """Mean-impute missing values and drop the 'Id' column.

    NaNs in numeric columns are replaced by that column's mean. Non-numeric
    columns are left untouched. The caller's frame is not modified; a new
    frame is returned.

    Args:
        data: DataFrame that contains an 'Id' column.

    Returns:
        A new DataFrame with NaNs filled and without the 'Id' column.

    Raises:
        KeyError: If ``data`` has no 'Id' column.
    """
    # numeric_only=True keeps this working when non-numeric columns remain
    # (newer pandas raises on them otherwise)
    column_means = data.mean(numeric_only=True)
    # Non-inplace fillna so the argument passed by the caller is not mutated
    filled = data.fillna(column_means)
    # Drop 'Id': the frame already has an index and the identifier must not
    # be used as a model feature
    return filled.drop('Id', axis=1)

In [5]:
# One-hot encode the training set, then mean-impute NaNs and drop 'Id'
train_data = replace_categorical(train_data,categorical_columns)
train_data = prep_data(train_data)

# Encode the test set the same way and fill NaNs with the test set's own column means.
# 'Id' is kept here because create_submission reads it; it is dropped when X_test is built.
test_data = replace_categorical(test_data,categorical_columns)
test_data.fillna(test_data.mean(),inplace=True)

In [6]:
def check_overlap(data1,data2):
    """List the columns of ``data1`` that do not appear in ``data2``.

    Args:
        data1: DataFrame whose columns are checked.
        data2: DataFrame whose columns are the reference set.

    Returns:
        List of column names present in ``data1`` but absent from ``data2``,
        in ``data1`` column order.
    """
    other_columns = list(data2.columns)
    return [name for name in data1.columns if name not in other_columns]

# Columns that exist only in the training frame; they are dropped from it when X is built
columns_in_train_not_in_test = check_overlap(train_data,test_data)

In [7]:
def train_data_X_and_Y(data,columns_missing,goal_column):
    """Split a training frame into a feature matrix and a target column.

    Args:
        data: Training DataFrame.
        columns_missing: Column names to remove from the feature matrix.
        goal_column: Name of the column to return as the target.

    Returns:
        Tuple ``(features, target)``: ``data`` without ``columns_missing``,
        and the ``goal_column`` Series taken from ``data``.
    """
    features = data.drop(columns=columns_missing)
    target = data[goal_column]
    return features, target

# Training features/target: remove every train-only column (this includes 'SalePrice') from X
X,Y = train_data_X_and_Y(train_data,columns_in_train_not_in_test,'SalePrice')
# Select exactly X's columns, in X's order, so the test matrix always matches
# the training matrix. This leaves out 'Id' (not a feature) and any dummy
# column that appears only in the test set. Every column of X exists in
# test_data because the train-only columns were removed above.
X_test = test_data[X.columns]

In [8]:
# Create multilinear regression model using sci-kit learn
# Fit an ordinary least squares model on all training features
model = linear_model.LinearRegression()
model.fit(X,Y)
# Show the fitted intercept and the per-feature coefficients
print('Intercept: \n', model.intercept_)
print('Coefficients: \n', model.coef_)

Intercept: 
 663934.2701922326
Coefficients: 
 [-1.10718919e+02 -9.03559657e+01  5.49465066e-01  7.77551036e+03
  5.26389579e+03  2.24726804e+02  5.87612163e+01  2.48382139e+01
  3.92033626e+00  5.13753472e+00 -2.49032345e+00  6.56756279e+00
  2.10764335e+01  3.19424617e+01 -3.04818321e+01  2.25369886e+01
  3.71108750e+03  6.32918270e+02  5.66626789e+03  1.11295310e+03
 -2.99138526e+03 -1.23904646e+04  2.70879556e+03  4.35529269e+03
 -5.35183742e+01  1.02629253e+04  2.56397430e+00  1.89713056e+01
  7.41281932e+00  1.62544596e+00  4.86367132e+01  2.81403620e+01
  7.43703885e+01  3.66781420e+00 -3.63675229e+02 -6.87495344e+02
 -2.42837411e+04  1.16674336e+04  4.30540086e+03  5.82816828e+03
  2.48273838e+03 -1.52657937e+04  1.52657937e+04  1.19511989e+03
  2.55784423e+03  2.24676808e+03  4.80347316e+03 -1.07818332e+04
  3.73159201e+03 -6.93839759e+03  6.08723310e+03 -4.36090358e+03
  5.21206807e+03  6.16371120e+04  4.07883733e+03  1.42973905e+04
 -5.60548460e+03 -1.45963412e+04  1.8255979

In [9]:
def create_submission(data, prediction, pred_num, prediction_column='SalePrice'):
    """Write a two-column submission CSV (Id plus predictions).

    The file is written to the current working directory as
    ``submission<pred_num>.csv`` without the index column.

    Args:
        data: DataFrame that contains an 'Id' column.
        prediction: Predicted values, one per row of ``data``.
        pred_num: Value appended to the file name to tell submissions apart.
        prediction_column: Header used for the prediction column.
    """
    out_path = 'submission' + str(pred_num) + '.csv'
    out_frame = data[['Id']].copy()
    out_frame[prediction_column] = prediction
    out_frame.to_csv(out_path, index=False)

In [10]:
# Predict the test set with the linear model and write submission1.csv
Y_prediction = model.predict(X_test)
create_submission(test_data,Y_prediction,1)

### First Trial Summary -- Simple linear regression earned me a score of ~0.4639 on Kaggle. Let's see how we can improve.

In [11]:
# A different linear model: Lasso (Least Absolute Shrinkage and Selection Operator), which automatically does feature selection

# alpha sets how strongly the L1 penalty (sum of the absolute values of the coefficients) is weighted against minimizing the RSS
lasso = linear_model.Lasso(alpha=10)
lasso.fit(X,Y)
y_lasso_pred = lasso.predict(X_test)

# Write submission2.csv
create_submission(test_data,y_lasso_pred,2)

### Second Trial Summary -- Lasso regression earned me a score of ~0.4611 on Kaggle

In [12]:
# Deep Learning model with Keras

def deep_learning_model(input_dim=270, hidden_units=135):
    """Build and compile a regression network with one hidden layer.

    Args:
        input_dim: Number of input features. Defaults to 270, the value that
            was previously hard-coded; pass ``X.shape[1]`` if the feature
            matrix has a different width.
        hidden_units: Number of units in the hidden layer (default 135, the
            previously hard-coded width).

    Returns:
        A compiled Keras ``Sequential`` model with a single output unit.
    """
    model = Sequential()
    # input dimensions is the number of independent variables (the columns of the training feature matrix)
    # activation = output of node (neuron) = exponential linear unit (after testing it yielded better results)
    model.add(Dense(hidden_units, input_dim=input_dim, kernel_initializer='normal', activation='elu'))
    # Single output unit with no activation, i.e. a plain linear output for regression
    model.add(Dense(1, kernel_initializer='normal'))

    # Compile model (configure for training)
    # optimizer 'adam' was chosen because it (on average) is the speediest
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Wrap the model builder as a scikit-learn style regressor and train it.
# NOTE(review): no standardization step is visible in this notebook; X is passed in unscaled.
estimator = KerasRegressor(build_fn=deep_learning_model, epochs=100, batch_size=5, verbose=0)

estimator.fit(X,Y)
y_keras_pred = estimator.predict(X_test)

# Write submission3.csv
create_submission(test_data,y_keras_pred,3)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


### Third Trial Summary -- Big improvement! Deep learning received a score on Kaggle of 0.207

In [13]:
def deep_learning_model2(input_dim=270, hidden_units=135):
    """Build and compile a regression network with two hidden layers.

    Args:
        input_dim: Number of input features. Defaults to 270, the value that
            was previously hard-coded; pass ``X.shape[1]`` if the feature
            matrix has a different width.
        hidden_units: Number of units in each hidden layer (default 135, the
            previously hard-coded width).

    Returns:
        A compiled Keras ``Sequential`` model with a single output unit.
    """
    model = Sequential()
    # input dimensions is the number of independent variables (the columns of the training feature matrix)
    # activation = output of node (neuron) = exponential linear unit (after testing it yielded better results)
    model.add(Dense(hidden_units, input_dim=input_dim, kernel_initializer='normal', activation='elu'))
    # NOTE(review): this second hidden layer has no activation, so it is linear -- confirm that is intended
    model.add(Dense(hidden_units, kernel_initializer='normal'))
    # Single output unit with no activation, i.e. a plain linear output for regression
    model.add(Dense(1, kernel_initializer='normal'))

    # Compile model (configure for training)
    # optimizer 'adam' was chosen because it (on average) is the speediest
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Wrap the two-hidden-layer model builder as a scikit-learn style regressor and train it.
# NOTE(review): no standardization step is visible in this notebook; X is passed in unscaled.
estimator2 = KerasRegressor(build_fn=deep_learning_model2, epochs=100, batch_size=5, verbose=0)

estimator2.fit(X,Y)
y_keras_pred2 = estimator2.predict(X_test)

# Write submission4.csv
create_submission(test_data,y_keras_pred2,4)

In [20]:
# Build a fresh (untrained) copy of the second network just to print its layer/parameter summary
deep_learning_model2().summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 135)               36585     
_________________________________________________________________
dense_7 (Dense)              (None, 135)               18360     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 136       
Total params: 55,081
Trainable params: 55,081
Non-trainable params: 0
_________________________________________________________________


### Fourth Trial-- added another layer to my neural network, which gave me a score on Kaggle of 0.164