# Neural Network Pre-Outlier Removal

This notebook contains the model results before removing outliers in the dataset.

---

In [None]:
# Read in csv files
import io
from google.colab import files

# Foundational packages
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Neural Network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, GRU, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Scoring
from sklearn.metrics import r2_score

# Upload Files

In [None]:
# Upload clean X train csv file
# The returned object is indexed by file name ('clean_X_train.csv') when the
# CSV is parsed in the read-in cell further down.
uploaded1 = files.upload()

Saving clean_X_train.csv to clean_X_train.csv


In [None]:
# Upload clean y train csv file
# The returned object is indexed by file name ('clean_y_train.csv') when the
# CSV is parsed in the read-in cell further down.
uploaded2 = files.upload()

Saving clean_y_train.csv to clean_y_train.csv


In [None]:
# Upload clean X test csv file
# The returned object is indexed by file name ('clean_X_test.csv') when the
# CSV is parsed in the read-in cell further down.
uploaded3 = files.upload()

Saving clean_X_test.csv to clean_X_test.csv


In [None]:
# Upload clean y test csv file
# The returned object is indexed by file name ('clean_y_test.csv') when the
# CSV is parsed in the read-in cell further down.
uploaded4 = files.upload()

Saving clean_y_test.csv to clean_y_test.csv


In [None]:
# Parse every uploaded payload into a DataFrame.
# Each upload result is looked up by its file name, wrapped in an in-memory
# byte buffer, and read with the first CSV column used as the row index.
X_train, y_train, X_test, y_test = (
    pd.read_csv(io.BytesIO(payload[filename]), index_col=0)
    for payload, filename in (
        (uploaded1, 'clean_X_train.csv'),
        (uploaded2, 'clean_y_train.csv'),
        (uploaded3, 'clean_X_test.csv'),
        (uploaded4, 'clean_y_test.csv'),
    )
)

In [None]:
# Take a look at my dataframes
# Preview only the first rows of each frame so that all four are shown the
# same way (features and target, for both the train and the test split).
display(X_train.head())
display(y_train.head())
display(X_test.head())
display(y_test.head())

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,"('location_Central Oahu',)","('location_East Honolulu',)","('location_Ewa',)","('location_Hamakua',)","('location_Hana',)","('location_Hanapepe-Eleele',)","('location_Kapaa-Wailua',)","('location_Kau',)","('location_Kihei-Makena',)","('location_Koloa-Poipu',)","('location_Koolauloa',)","('location_Koolaupoko',)","('location_Lahaina',)","('location_Lanai',)","('location_Lihue',)","('location_Makawao-Pukalani-Kula',)","('location_Molokai',)","('location_North Hilo',)","('location_North Kohala',)","('location_North Kona',)","('location_North Shore Kauai',)","('location_North Shore Oahu',)","('location_Paia-Haiku',)","('location_Primary Urban Center',)","('location_Puna',)","('location_South Hilo',)","('location_South Kohala',)",...,"('amenity_Ricemaker',)","('amenity_Roku',)","('amenity_Room-darkeningshades',)","('amenity_Safe',)","('amenity_Selfcheck-in',)","('amenity_Shampoo',)","('amenity_Sharedgardenorbackyard',)","('amenity_Sharedgyminbuilding',)","('amenity_Sharedhottub',)","('amenity_Sharedoutdoorheatedpool',)","('amenity_Sharedoutdoorpool',)","('amenity_Showergel',)","('amenity_Singlelevelhome',)","('amenity_Smartlock',)","('amenity_Smokealarm',)","('amenity_Soundsystem',)","('amenity_Stainlesssteelelectricstove',)","('amenity_Stainlesssteeloven',)","('amenity_StartUpSupplyOnlyconditioner',)","('amenity_StartUpSupplyOnlyshampoo',)","('amenity_Stove',)","('amenity_TVwithNetflix',)","('amenity_TVwithstandardcable',)","('amenity_Tablecornerguards',)","('amenity_Toaster',)","('amenity_Trashcompactor',)","('amenity_Washer',)","('amenity_Washer\\u2013\\u00a0Inbuilding',)","('amenity_Washer\\u2013\\u00a0Inunit',)","('amenity_Waterfront',)","('amenity_Wifi\\u2013100Mbps',)","('amenity_Wifi\\u2013200Mbps',)","('amenity_Wifi\\u2013300Mbps',)","
('amenity_Wifi\\u2013400Mbps',)","('amenity_WindowACunit',)","('amenity_Windowguards',)","('amenity_Wineglasses',)","('amenity_premiumcable',)","('amenity_standardcable',)",sentiment_polarity
15336,21.29081,-157.83992,3,1.0,1,1,89,9,9,9,9,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.179459
4116,19.62870,-155.98652,6,2.0,3,4,100,10,10,10,10,10,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.500000
14298,20.97027,-156.67791,6,2.0,2,2,100,10,10,10,10,10,8,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.466667
9523,19.55464,-155.96514,8,3.0,3,3,100,9,10,10,9,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.313273
3836,20.92296,-156.69455,6,2.0,2,3,100,9,10,10,10,10,10,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.298500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,22.21884,-159.49600,4,1.0,1,2,80,9,8,9,8,10,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.283618
5192,21.87091,-159.44859,4,2.0,2,3,100,10,10,8,10,10,10,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.364286
12172,20.96023,-156.68489,4,1.0,1,2,99,10,10,10,10,10,10,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.340160
235,21.87562,-159.45014,14,4.0,6,8,90,9,9,10,10,10,10,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.317527


Unnamed: 0,price
15336,99.0
4116,130.0
14298,261.0
9523,320.0
3836,519.0


Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,"('location_Central Oahu',)","('location_East Honolulu',)","('location_Ewa',)","('location_Hamakua',)","('location_Hana',)","('location_Hanapepe-Eleele',)","('location_Kapaa-Wailua',)","('location_Kau',)","('location_Kihei-Makena',)","('location_Koloa-Poipu',)","('location_Koolauloa',)","('location_Koolaupoko',)","('location_Lahaina',)","('location_Lanai',)","('location_Lihue',)","('location_Makawao-Pukalani-Kula',)","('location_Molokai',)","('location_North Hilo',)","('location_North Kohala',)","('location_North Kona',)","('location_North Shore Kauai',)","('location_North Shore Oahu',)","('location_Paia-Haiku',)","('location_Primary Urban Center',)","('location_Puna',)","('location_South Hilo',)","('location_South Kohala',)",...,"('amenity_Ricemaker',)","('amenity_Roku',)","('amenity_Room-darkeningshades',)","('amenity_Safe',)","('amenity_Selfcheck-in',)","('amenity_Shampoo',)","('amenity_Sharedgardenorbackyard',)","('amenity_Sharedgyminbuilding',)","('amenity_Sharedhottub',)","('amenity_Sharedoutdoorheatedpool',)","('amenity_Sharedoutdoorpool',)","('amenity_Showergel',)","('amenity_Singlelevelhome',)","('amenity_Smartlock',)","('amenity_Smokealarm',)","('amenity_Soundsystem',)","('amenity_Stainlesssteelelectricstove',)","('amenity_Stainlesssteeloven',)","('amenity_StartUpSupplyOnlyconditioner',)","('amenity_StartUpSupplyOnlyshampoo',)","('amenity_Stove',)","('amenity_TVwithNetflix',)","('amenity_TVwithstandardcable',)","('amenity_Tablecornerguards',)","('amenity_Toaster',)","('amenity_Trashcompactor',)","('amenity_Washer',)","('amenity_Washer\\u2013\\u00a0Inbuilding',)","('amenity_Washer\\u2013\\u00a0Inunit',)","('amenity_Waterfront',)","('amenity_Wifi\\u2013100Mbps',)","('amenity_Wifi\\u2013200Mbps',)","('amenity_Wifi\\u2013300Mbps',)","
('amenity_Wifi\\u2013400Mbps',)","('amenity_WindowACunit',)","('amenity_Windowguards',)","('amenity_Wineglasses',)","('amenity_premiumcable',)","('amenity_standardcable',)",sentiment_polarity
5341,20.95209,-156.68902,7,3.0,3,5,100,10,10,10,10,10,10,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0.366563
12634,20.76369,-155.99276,2,1.0,1,2,94,10,10,10,10,10,9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.360092
8947,21.28854,-157.80368,4,1.0,2,2,99,10,10,10,10,10,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.370724
10258,22.22768,-159.4751,4,2.0,2,3,88,10,9,10,10,10,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.380278
9513,20.92746,-156.69267,2,1.0,1,1,100,10,10,9,10,10,9,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.461886


Unnamed: 0,price
5341,492.0
12634,57.0
8947,150.0
10258,210.0
9513,173.0


## Exploring Neural Networks w/o Dimension Reduction

Let's take a look at a simple neural network model and see if there are any model adjustments that need to be done. As with many models, neural networks benefit from scaling of the data. I will use the `MinMaxScaler` to rescale every feature to the range 0 to 1 while preserving the relative distances between the values within each feature. The choice of scaler is unlikely to greatly affect my outcomes, as most of my features are encoded as 0/1 indicator columns.

In [None]:
# Rescale the features with a min-max scaler.
# The scaler learns its per-feature range from the training set only; the
# test set is then transformed with those same training-set parameters.
minmaxscaler = MinMaxScaler()
X_train = minmaxscaler.fit_transform(X_train)
X_test = minmaxscaler.transform(X_test)

I will use 2 hidden layers with 64 nodes each, followed by a single-node output layer to generate predictions. In terms of the activation, `relu` will be used in the hidden layers to get a quick output.

In [None]:
# Baseline network: two 64-unit ReLU hidden layers followed by a
# single-unit output layer (no activation) that produces the prediction.
model = keras.Sequential(
    [
        layers.Dense(64, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(1),
    ]
)

# Minimise mean absolute error with an Adam optimizer left at its defaults.
model.compile(
    loss=keras.losses.MeanAbsoluteError(),
    optimizer=keras.optimizers.Adam(),
)

In [None]:
# Train the baseline model for 10 epochs; validation_split=0.2 sets aside a
# fifth of the training data to report val_loss after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    verbose=2,
)

Epoch 1/10
322/322 - 1s - loss: 154.3247 - val_loss: 105.0415
Epoch 2/10
322/322 - 1s - loss: 109.4109 - val_loss: 91.8943
Epoch 3/10
322/322 - 1s - loss: 100.9895 - val_loss: 86.2908
Epoch 4/10
322/322 - 0s - loss: 96.4660 - val_loss: 83.0765
Epoch 5/10
322/322 - 1s - loss: 93.4424 - val_loss: 81.5585
Epoch 6/10
322/322 - 0s - loss: 91.3875 - val_loss: 79.4693
Epoch 7/10
322/322 - 1s - loss: 89.9514 - val_loss: 78.7040
Epoch 8/10
322/322 - 0s - loss: 88.8178 - val_loss: 77.3443
Epoch 9/10
322/322 - 1s - loss: 87.5598 - val_loss: 76.6964
Epoch 10/10
322/322 - 0s - loss: 86.6581 - val_loss: 76.2034


In [None]:
# Compare the loss recorded for the final training epoch with the loss
# measured on the held-out test set (both are mean absolute error).
test_results = model.evaluate(X_test, y_test, verbose=0)
training_loss = history.history["loss"][-1]

print(
    f"Train Loss: {training_loss:.4f}",
    f"Test Loss: {test_results:.4f}",
    sep="\n",
)

Train Loss: 86.6581
Test Loss: 94.3982


In [None]:
# Coefficient of determination of the baseline model on the test set
y_predict = model.predict(X_test)
print(
    f"R2 Score: {r2_score(y_true=y_test, y_pred=y_predict)}"
)

R2 Score: 0.08914207352092796


I get a relatively high mean absolute error (loss) here, which suggests that the model fits the data poorly. The $ R^2 $ score of about 0.09 is positive, which means the neural network does slightly better than always predicting the mean `price`, but it explains only about 9% of the variance in the AirBnB listing `price`.

Since my loss is very high, I will first add 2 more hidden layers, a greater number of nodes per layer, and dropout to see how much the loss is brought down.

In [None]:
# Wider and deeper network: four 512-unit ReLU hidden layers, each followed
# by 10% dropout, then a single-unit output layer.
model2 = keras.Sequential(
    [
        layers.Dense(512, activation="relu"),
        Dropout(0.10),
        layers.Dense(512, activation="relu"),
        Dropout(0.10),
        layers.Dense(512, activation="relu"),
        Dropout(0.10),
        layers.Dense(512, activation="relu"),
        Dropout(0.10),
        layers.Dense(1),
    ]
)

# Minimise mean absolute error with an Adam optimizer left at its defaults.
model2.compile(
    loss=keras.losses.MeanAbsoluteError(),
    optimizer=keras.optimizers.Adam(),
)

In [None]:
# Train the larger model for 10 epochs; validation_split=0.2 sets aside a
# fifth of the training data to report val_loss after every epoch.
history2 = model2.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Take a look at the loss and get prediction score
# Train loss is the value recorded for the last epoch; test loss is the
# model's loss (mean absolute error) evaluated on the held-out test set.
training_loss = history2.history["loss"][-1]
test_results = model2.evaluate(X_test, y_test, verbose=0)

# Both values are printed to 4 decimals so they line up with the other
# loss reports in this notebook.
print(f"Train Loss: {training_loss:.4f}")
print(f"Test Loss: {test_results:.4f}")

Train Loss: 73.7664
Test Loss: 101.75892639160156


In [None]:
# Coefficient of determination of the larger model on the test set
y_predict = model2.predict(X_test)
print(
    f"R2 Score: {r2_score(y_true=y_test, y_pred=y_predict)}"
)

R2 Score: 0.1300899542429812


I can see that the train loss has been brought down further (86.66 to 73.77) and the $ R^2 $ score has increased (0.089 to 0.130). However, the test loss has risen (94.40 to 101.76), and the loss values tend to plateau around 70, so the model may be overfitting to the train data. Let's explore a different approach with a lower number of nodes, as the previous model seemed to continuously reduce validation loss.

In [None]:
# Narrower network: five ReLU hidden layers alternating between 256 and 128
# units, each followed by 10% dropout, then a single-unit output layer.
model3 = keras.Sequential(
    [
        layers.Dense(256, activation="relu"),
        Dropout(0.10),
        layers.Dense(128, activation="relu"),
        Dropout(0.10),
        layers.Dense(256, activation="relu"),
        Dropout(0.10),
        layers.Dense(128, activation="relu"),
        Dropout(0.10),
        layers.Dense(256, activation="relu"),
        Dropout(0.10),
        layers.Dense(1),
    ]
)

# Minimise mean absolute error with an Adam optimizer left at its defaults.
model3.compile(
    loss=keras.losses.MeanAbsoluteError(),
    optimizer=keras.optimizers.Adam(),
)

In [None]:
# Train the third model for 10 epochs in mini-batches of 50 samples;
# validation_split=0.2 sets aside a fifth of the training data for val_loss.
history3 = model3.fit(
    X_train,
    y_train,
    batch_size=50,
    validation_split=0.2,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Take a look at the loss and get prediction score
# Train loss is the value recorded for the last epoch; test loss is the
# model's loss (mean absolute error) evaluated on the held-out test set.
training_loss = history3.history["loss"][-1]
test_results = model3.evaluate(X_test, y_test, verbose=0)

# Both values are printed to 4 decimals so they line up with the other
# loss reports in this notebook.
print(f"Train Loss: {training_loss:.4f}")
print(f"Test Loss: {test_results:.4f}")

Train Loss: 76.9727
Test Loss: 86.79117584228516


In [None]:
# Coefficient of determination of the third model on the test set
y_predict = model3.predict(X_test)
print(
    f"R2 Score: {r2_score(y_true=y_test, y_pred=y_predict)}"
)

R2 Score: 0.11435216642083823


This model achieves the lowest test loss of the models so far (86.79), although its $ R^2 $ score (0.114) is slightly below that of the previous model (0.130). It also indicates that I may have to reduce the dimensions of the data, as changes to the neural network models have not improved my results significantly. Recall that the PCA analysis found that an `n_components` of 5 captures > 97% of the variance of my data. I would expect that using the PCA-transformed data will improve the neural network results, as the data contains only the most important information possible. Let's take a look:

## Exploring Neural Networks w/ PCA

In [None]:
# Use PCA to reduce the data
# random_state pins the result whenever scikit-learn selects a stochastic
# (randomized / arpack) SVD solver, so the 5 components are repeatable
# across kernel restarts; it has no effect with a deterministic solver.
pca = PCA(n_components=5, random_state=42)
# Learn the components from the training data only, then project both splits
# onto them.
pca.fit(X_train)
transformed_X_train = pca.transform(X_train)
transformed_X_test = pca.transform(X_test)

In [None]:
# Network for the PCA-transformed data: five ReLU hidden layers alternating
# between 64 and 32 units, then a single-unit output layer (no dropout).
PCA_transformed_model = keras.Sequential(
    [
        layers.Dense(64, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(1),
    ]
)

# Minimise mean absolute error with an Adam optimizer left at its defaults.
PCA_transformed_model.compile(
    loss=keras.losses.MeanAbsoluteError(),
    optimizer=keras.optimizers.Adam(),
)

In [None]:
# Train on the PCA-reduced features for 10 epochs; validation_split=0.2 sets
# aside a fifth of the training data to report val_loss after every epoch.
PCA_history = PCA_transformed_model.fit(
    transformed_X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    verbose=2,
)

Epoch 1/10
322/322 - 1s - loss: 149.7061 - val_loss: 112.1968
Epoch 2/10
322/322 - 0s - loss: 123.6945 - val_loss: 111.8428
Epoch 3/10
322/322 - 0s - loss: 122.6989 - val_loss: 110.8207
Epoch 4/10
322/322 - 0s - loss: 122.0563 - val_loss: 109.9633
Epoch 5/10
322/322 - 0s - loss: 121.5306 - val_loss: 111.6290
Epoch 6/10
322/322 - 0s - loss: 121.2108 - val_loss: 109.2399
Epoch 7/10
322/322 - 0s - loss: 120.4561 - val_loss: 108.7958
Epoch 8/10
322/322 - 0s - loss: 120.3302 - val_loss: 108.5750
Epoch 9/10
322/322 - 0s - loss: 120.0653 - val_loss: 108.6445
Epoch 10/10
322/322 - 0s - loss: 119.7785 - val_loss: 109.6730


In [None]:
# Compare the loss recorded for the final training epoch with the loss
# measured on the PCA-transformed test set (both are mean absolute error).
test_results = PCA_transformed_model.evaluate(transformed_X_test, y_test, verbose=0)
training_loss = PCA_history.history["loss"][-1]

print(
    f"Train Loss: {training_loss:.4f}",
    f"Test Loss: {test_results:.4f}",
    sep="\n",
)

Train Loss: 119.7785
Test Loss: 124.6331


In [None]:
# Coefficient of determination of the PCA-based model on the test set
y_predict = PCA_transformed_model.predict(transformed_X_test)
print(
    f"R2 Score: {r2_score(y_true=y_test, y_pred=y_predict)}"
)

R2 Score: 0.018872082264153045


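I can see that the train and test loss are much closer to each other now (119.78 vs. 124.63), but after using PCA the validation loss is overall higher than before and the $R^2$ score (0.019) is very poor. Contrary to my expectation, reducing the data to 5 principal components did not improve the neural network results.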