<a href="https://colab.research.google.com/github/aaubs/ds-master/blob/main/notebooks/M3_W1_ann_AirBnb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# M3 - Deep Learning Workshop 1

In this workshop we will revisit the [Airbnb dataset](http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/listings.csv.gz) that we used in M1.

In [None]:
!pip install -q fancyimpute

In [None]:
# Core data-handling and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # apply seaborn's default styling to matplotlib figures

#from sklearn.metrics import mean_squared_error

# Keras building blocks and metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.metrics import mean_squared_error
# `keras.utils.vis_utils` is not importable in recent Keras releases; the public
# `keras.utils.plot_model` entry point exists in both older and newer versions.
from keras.utils import plot_model

In [None]:
# import imputation
from fancyimpute import IterativeImputer

In [None]:
# Listings snapshot read straight from the gzipped CSV at the URL below
listings_url = 'http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2022-06-24/data/listings.csv.gz'
data = pd.read_csv(listings_url)

In [None]:
# Strip the currency symbol and the thousands separator so the column can be cast to float.
# regex=False matters: interpreted as a regular expression, '$' is the end-of-string
# anchor, so on pandas versions where `regex` defaults to True the dollar sign would
# be left in place and the float conversion in the next cell would fail.
data['price'] = (
    data['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
# The cleaned price strings are converted to floating point numbers
data['price'] = data.price.astype(float)

In [None]:
# Translate the 'f'/'t' flags in `instant_bookable` into real booleans
mapping = {
    'f': False,
    't': True,
}
data = data.replace({"instant_bookable": mapping})

In [None]:
# Keep only listings whose room type is one of the two listed here
kept_room_types = ['Private room', 'Entire home/apt']
room_type_mask = data['room_type'].isin(kept_room_types)
data = data[room_type_mask]

In [None]:
# Remove price outliers: keep listings whose price lies within two (population)
# standard deviations of the mean.
# `assign` returns a new frame, which avoids writing a column into the filtered
# slice produced by the previous cell (the source of SettingWithCopyWarning).
price_z = (data['price'] - data['price'].mean()) / data['price'].std(ddof=0)
data = data.assign(price_z=price_z.abs())
data = data[data.price_z < 2]

In [None]:
# Keep listings with at least one review in `number_of_reviews_l30d`
has_recent_review = data['number_of_reviews_l30d'].ge(1)
data = data[has_recent_review]

In [None]:
# Keep listings whose overall review score is 4 or higher
well_rated = data['review_scores_rating'].ge(4)
data = data[well_rated]

## Feature Engineering

- Selecting features
- creating dummies
- binning
- scaling

In [None]:
# Columns used for modelling; the target 'price' is deliberately kept last
# because the feature matrix is later taken as "everything but the last column".
model_columns = [
    'neighbourhood_cleansed',
    'room_type',
    'instant_bookable',
    'accommodates',
    'bedrooms',
    'beds',
    'minimum_nights_avg_ntm',
    'price',
]
selected_df = data[model_columns]

In [None]:
# Drop every row that has a missing value in any of the selected columns
# (a narrower alternative would be to only require 'bedrooms' and 'beds').
selected_df = selected_df.dropna(axis=0, how='any')

In [None]:
# Feature matrix: every selected column except the last one (the target 'price').
# .copy() makes X an independent frame, so the scaled values written into it later
# are not assigned into a slice of selected_df.
X = selected_df.iloc[:, :-1].copy()

In [None]:
# Target vector: the listing price
y = selected_df['price']

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
import itertools

In [None]:
# Dense (non-sparse) one-hot encoder for the categorical columns.
# Newer scikit-learn releases name the argument `sparse_output`; older ones only
# know `sparse`. Try the new name first and fall back so both are supported.
try:
    ohe_X = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe_X = OneHotEncoder(sparse=False)

In [None]:
# The first two columns of X (neighbourhood and room type) are the categorical ones
categorical_part = X.iloc[:, 0:2]
X_ohe = ohe_X.fit_transform(categorical_part)

In [None]:
# Flatten the per-feature category arrays into one list of dummy column names
columns_X_ohe = list(itertools.chain.from_iterable(ohe_X.categories_))

In [None]:
# Wrap the encoded array in a DataFrame labelled with the category names
X_cat = pd.DataFrame(data=X_ohe, columns=columns_X_ohe)

In [None]:
# Preview the first rows only instead of rendering the whole frame
X_cat.head()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardiser (zero mean, unit variance) for the non-categorical columns
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Standardise everything after the two categorical columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the features — consider fitting on the training rows only.
numeric_part = X.iloc[:, 2:]
transformed_nummerical = scaler.fit(numeric_part).transform(numeric_part)

In [None]:
# Replace the non-categorical columns with their standardised values.
# Assigning by column label swaps in whole new float columns; writing through
# .iloc instead tries to store the floats inside the existing columns, whose
# dtypes (e.g. the boolean `instant_bookable`) need not be float.
numeric_columns = X.columns[2:]
X[numeric_columns] = transformed_nummerical

In [None]:
# Preview the first rows only instead of rendering the whole frame
X.head()

In [None]:
# Give both frames the same fresh 0..n-1 index so they can be joined row by row
X = X.reset_index(drop=True)
X_cat = X_cat.reset_index(drop=True)

In [None]:
# Final design matrix: scaled non-categorical columns first, one-hot dummies after
scaled_part = X.iloc[:, 2:]
X_enc = scaled_part.join(X_cat, how='left')

## Splitting and preparing for ML



In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and therefore every score reported below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y, test_size=0.2, random_state=42
)

Benchmark: non-neural models

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least squares benchmark fitted on the training split
model_ols = LinearRegression().fit(X_train, y_train)
model_ols

In [None]:
# Score of the linear model on the data it was trained on
model_ols.score(X=X_train, y=y_train)

In [None]:
# Training RMSE of the linear model, computed directly with NumPy so that
# scikit-learn predictions are not routed through the Keras metric function.
train_residuals = np.asarray(y_train) - model_ols.predict(X_train)
np.sqrt(np.mean(train_residuals ** 2))

In [None]:
# Score of the linear model on the held-out test split
model_ols.score(X=X_test, y=y_test)

In [None]:
# Test RMSE of the linear model, computed directly with NumPy so that
# scikit-learn predictions are not routed through the Keras metric function.
test_residuals = np.asarray(y_test) - model_ols.predict(X_test)
np.sqrt(np.mean(test_residuals ** 2))

In [None]:
# Second benchmark: an XGBoost regressor created with its default hyper-parameters
from xgboost import XGBRegressor
model_xgb = XGBRegressor()

In [None]:
# Fit the boosted model on the training split
model_xgb.fit(X=X_train, y=y_train)

In [None]:
# Score of the boosted model on the data it was trained on
model_xgb.score(X=X_train, y=y_train)

In [None]:
# Score of the boosted model on the held-out test split
model_xgb.score(X=X_test, y=y_test)

In [None]:
# Test RMSE of the boosted model, computed directly with NumPy so that the
# predictions are not routed through the Keras metric function.
xgb_test_residuals = np.asarray(y_test) - model_xgb.predict(X_test)
np.sqrt(np.mean(xgb_test_residuals ** 2))

In [None]:
# Inspect the dimensions of the training matrix; the number of columns is the
# input width the neural networks below have to match.
X_train.shape

Baseline neural model

In [None]:
# Baseline network: one hidden ReLU layer with 10 units and a single linear output.
# The input width is read from the data instead of being hard-coded to 18, so the
# model keeps working when the number of one-hot columns changes.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(10, activation='relu', input_shape=(n_features,)))
model.add(Dense(1, kernel_initializer='normal'))
# `metrics` is documented as a list of metrics, so the function is wrapped in one
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])

In [None]:
# Train the baseline; a tenth of the training data is set aside for validation
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=200,
    validation_split=0.1,
    verbose=2,
)

In [None]:
# Learning curves: MSE per epoch on the training data and on the validation split.
# The second curve comes from `validation_split`, not from the test set, so it is
# labelled "validation".
plt.plot(history.history['mean_squared_error'])
plt.plot(history.history['val_mean_squared_error'])
plt.title('model MSE')
plt.ylabel('MSE')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Loss/metric on the training data. Stored under its own name so that it neither
# shadows Python's built-in eval() nor gets confused with the test-set result.
train_scores = model.evaluate(X_train, y_train)

In [None]:
# Evaluate on the held-out test split; the result is read by the following cells.
# NOTE(review): the name `eval` shadows Python's built-in eval() — consider renaming
# it together with the cells that read it.
eval = model.evaluate(X_test, y_test)

In [None]:
# Display the test-set evaluation result (its first element is used as the MSE below)
eval

In [None]:
# The model is compiled with an MSE loss, so the square root of the first
# evaluation value is the test RMSE.
np.sqrt(eval[0])

## Prevent overfitting

In [None]:
# Larger network (256 -> 16 -> 1) used as the starting point for the regularisation experiments.
# The input width is read from the data instead of being hard-coded to 18.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(n_features,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# `metrics` is documented as a list of metrics, so the function is wrapped in one
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])


In [None]:
# Short, silent training run; a tenth of the training data is used for validation
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
    verbose=0,
)

In [None]:
# Test RMSE: square root of the MSE loss returned first by evaluate().
# A descriptive name is used so Python's built-in eval() is not shadowed.
test_scores = model.evaluate(X_test, y_test)
np.sqrt(test_scores[0])

![](https://miro.medium.com/max/1400/0*iNI8Oc80Eunm8NgI)

https://matthewmcateer.me/blog/optimal-brain-damage/
http://yann.lecun.com/exdb/publis/pdf/lecun-90b.pdf


In [None]:
# Same 256 -> 16 -> 1 network with dropout after the first hidden layer.
# The input width is read from the data instead of being hard-coded to 18.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.3)) # <<-- Added dropout
model.add(Dense(16, activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# `metrics` is documented as a list of metrics, so the function is wrapped in one
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])


In [None]:
# Render the architecture (layer names and tensor shapes) to model_plot.png
plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
    to_file='model_plot.png',
)

In [None]:
# Short, silent training run; a tenth of the training data is used for validation
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
    verbose=0,
)

In [None]:
# Test RMSE: square root of the MSE loss returned first by evaluate().
# A descriptive name is used so Python's built-in eval() is not shadowed.
test_scores = model.evaluate(X_test, y_test)
np.sqrt(test_scores[0])

In [None]:
from keras.regularizers import l1

### Regularization

In [None]:
# Same 256 -> 16 -> 1 network with an L1 activity regulariser on the second hidden layer.
# The input width is read from the data instead of being hard-coded to 18.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(n_features,)))
model.add(Dense(16, activation='relu', activity_regularizer=l1(0.001))) # <-- add activity regularizer
model.add(Dense(1, kernel_initializer='normal'))
# `metrics` is documented as a list of metrics, so the function is wrapped in one
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])

# Short, silent training run; a tenth of the training data is used for validation
history = model.fit(X_train,
                    y_train,
                    epochs=20,
                    batch_size=32,
                    verbose=0, validation_split=0.1)

In [None]:
# Test RMSE: square root of the MSE loss returned first by evaluate().
# A descriptive name is used so Python's built-in eval() is not shadowed.
test_scores = model.evaluate(X_test, y_test)
np.sqrt(test_scores[0])

## Bonus: Multi-branch architecture

This chunk uses the functional Keras API, which is more flexible than the sequential model. With it you can, for instance, combine text and images with tabular data in a single model.
Here I'm going to define two branches: one for all dummies and the other for the "normal" numerical variables.

In [None]:
# Peek at the first five rows to see the column order used for the two branches
X_enc.head(n=5)

In [None]:
# Functional-API building blocks
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate
# `keras.utils.vis_utils` is not importable in recent Keras releases; the public
# `keras.utils.plot_model` entry point exists in both older and newer versions.
from keras.utils import plot_model

In [None]:
# Two-branch network built with the functional API.
# The first 5 columns of X_enc are the scaled non-categorical features, the
# remaining columns are the one-hot dummies; the dummy width is derived from the
# data instead of being hard-coded to 13.
n_numeric = 5
n_dummies = X_train.shape[1] - n_numeric

# define two sets of inputs
inputA = Input(shape=(n_numeric,))
inputB = Input(shape=(n_dummies,))
# the first branch operates on the first input
x1 = Dense(8, activation="relu")(inputA)
x1 = Dense(4, activation="relu")(x1)
x1 = Model(inputs=inputA, outputs=x1)
# the second branch operates on the second input
x2 = Dense(64, activation="relu")(inputB)
x2 = Dense(32, activation="relu")(x2)
x2 = Dense(4, activation="relu")(x2)
x2 = Model(inputs=inputB, outputs=x2)
# combine the output of the two branches
combined = concatenate([x1.output, x2.output])
# apply a FC layer and then a regression prediction on the combined outputs.
# The tensors are named `output` rather than `y`, so the target Series `y`
# defined earlier in the notebook is not overwritten.
output = Dense(2, activation="relu")(combined)
output = Dense(1, activation="linear")(output)
# our model will accept the inputs of the two branches and
# then output a single value
model = Model(inputs=[x1.input, x2.input], outputs=output)

In [None]:
# Compile with an MSE loss; `metrics` is documented as a list of metrics,
# so the function is wrapped in one.
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])

In [None]:
# Render the two-branch architecture (layer names and tensor shapes) to model_plot.png
plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
    to_file='model_plot.png',
)

In [None]:
# Feed each branch its own column block: the first five columns go to the
# first input, everything after them to the second.
branch_inputs_train = [X_train.iloc[:, :5], X_train.iloc[:, 5:]]
history = model.fit(
    x=branch_inputs_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Learning curves: MSE per epoch on the training data and on the validation split.
# The second curve comes from `validation_split`, not from the test set, so it is
# labelled "validation".
plt.plot(history.history['mean_squared_error'])
plt.plot(history.history['val_mean_squared_error'])
plt.title('model MSE')
plt.ylabel('MSE')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Test RMSE of the two-branch model: square root of the MSE loss returned first
# by evaluate(). A descriptive name is used so Python's built-in eval() is not shadowed.
test_scores = model.evaluate([X_test.iloc[:, :5], X_test.iloc[:, 5:]], y_test)
np.sqrt(test_scores[0])