Baseline model: review texts are ignored

In [0]:
# TensorFlow
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Activation, Dropout, Input, Embedding, BatchNormalization, Reshape, Concatenate

# Helper libraries
import pandas as pd
import numpy as np
import math

# Preprocessing data
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder


In [0]:
# Mount Google Drive at /content/gdrive so that files stored on the drive
# become readable through the local file system of the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Training data
df_train = pd.read_csv("/content/gdrive/My Drive/colab notebook/beer ratings/train.csv", index_col=['index'])

# Divide the ratings by 5 to match the outputs of the sigmoid function in the output layer of the model.
# A plain vectorized division gives the same values as the former element-wise
# `applymap(lambda x: x / 5)`, without the per-cell Python call (and `applymap`
# is deprecated in recent pandas versions).
y_train = df_train[['review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste']] / 5

# Test data
df_test = pd.read_csv("/content/gdrive/My Drive/colab notebook/beer ratings/test.csv", index_col=['index'])

In [0]:
def build_model(len_embed_cols, n_numeric_cols=3):
  """Build and compile the baseline rating-regression network.

  Every categorical feature gets its own integer input followed by an
  embedding layer. The embedding outputs are concatenated with a single
  numerical input and fed through three Dense/BatchNorm/ReLU/Dropout stages.
  The output layer has 5 sigmoid units.

  Args:
    len_embed_cols: iterable with one vocabulary size per categorical
      feature. Each size is passed to `Embedding` as its input dimension, so
      the encoded ids fed to that input have to lie in [0, size).
    n_numeric_cols: width of the numerical input. Defaults to 3, the value
      that used to be hard-coded.

  Returns:
    The compiled Keras model. Its inputs are the categorical inputs, in the
    order of `len_embed_cols`, followed by the numerical input.
  """
  model_out = []
  model_in = []

  # Embedding inputs for categorical features
  for vocab_size in len_embed_cols:
    # Embedding width grows with log(vocabulary size) but is never below 2
    embed_width = max(2, int(math.log(vocab_size)))
    cat_input = Input(shape=(1,), dtype='int32')
    # The sequence length is already fixed by the Input shape, so the
    # deprecated `input_length` argument is not needed.
    embedded = Embedding(vocab_size, embed_width)(cat_input)
    embedded = Dropout(0.25)(embedded)
    # Drop the length-1 sequence axis so the embeddings can be concatenated
    embedded = Reshape((embed_width,))(embedded)
    model_out.append(embedded)
    model_in.append(cat_input)

  # Numerical features
  input_numeric = Input(shape=(n_numeric_cols,), dtype='float32')
  model_in.append(input_numeric)

  # Combining the output of embedding model and numerical features as the input of the final model
  outputs = Concatenate(axis=1)([*model_out, input_numeric])
  for units in (512, 512, 128):
    outputs = Dense(units)(outputs)
    outputs = BatchNormalization()(outputs)
    outputs = Activation('relu')(outputs)
    outputs = Dropout(0.5)(outputs)

  # One sigmoid unit per target; targets are expected to be scaled to [0, 1]
  outputs = Dense(5)(outputs)
  outputs = Activation('sigmoid')(outputs)

  model = Model(model_in, outputs)

  # NOTE(review): 'acc' is kept for parity with the original metrics list,
  # although accuracy is of limited meaning for a regression target.
  model.compile(optimizer='sgd',
                loss='mse',
                metrics=['mae', 'acc'])

  return model

In [0]:
def process_cat_cols(X_train, X_test, cat_cols):
  """Label-encode the categorical columns in place and return their vocabulary sizes.

  We use entity embedding to deal with the categorical features, so every
  column in `cat_cols` is replaced by integer ids in both frames.

  Args:
    X_train: training DataFrame; its `cat_cols` columns are overwritten.
    X_test: test DataFrame; its `cat_cols` columns are overwritten.
    cat_cols: names of the categorical columns to encode.

  Returns:
    A list with the number of distinct labels of each column, in the order of
    `cat_cols`. The count covers train AND test, because the encoder is fit
    on both: an id produced by the encoder can be as large as that count
    minus one, so counting only the training labels would yield embedding
    tables that are too small for some ids.
  """
  vocab_sizes = {}

  # LabelEncoder
  for c in cat_cols:
    lbl = LabelEncoder()
    # Fit on train + test so labels that only occur in the test set can be transformed
    lbl.fit(list(X_train[c].values) + list(X_test[c].values))
    X_train[c] = lbl.transform(list(X_train[c].values))
    X_test[c] = lbl.transform(list(X_test[c].values))
    vocab_sizes[c] = len(lbl.classes_)

  print(vocab_sizes.keys())

  len_embed_cols = []
  for c in cat_cols:
    len_embed_cols.append(vocab_sizes[c])
    print(c + ': %d values' % vocab_sizes[c]) #look at value counts to know the embedding dimensions

  print('\nNumber of categorical features :', len(cat_cols))

  return len_embed_cols

In [0]:
def process_numeric_cols(X_train, X_test, numeric_cols):
  """Standardize the numerical columns of both frames in place.

  The scaler is fit on the training frame only and then applied to the test
  frame, so both are scaled with the training statistics.

  Args:
    X_train: training DataFrame; its `numeric_cols` columns are overwritten.
    X_test: test DataFrame; its `numeric_cols` columns are overwritten.
    numeric_cols: names of the numerical columns to standardize
      (beer/ABV, review/timeUnix, user/ageInSeconds in this notebook).

  Returns:
    None. The function no longer appends to `input_train` / `input_test`:
    those names are not defined in this scope (they are locals of the
    caller), and the caller appends the scaled columns itself.
  """
  scaler = StandardScaler()
  X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
  X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

  print('\nNumber of numerical features :', len(numeric_cols))

In [0]:
def preprocess_data(df_train, df_test):
  """Turn the raw train/test frames into the list-of-arrays inputs of the model.

  Args:
    df_train: raw training DataFrame.
    df_test: raw test DataFrame. The rating columns may be absent from it.

  Returns:
    A tuple `(input_train, input_test, len_embed_cols)`:
      - `input_train` / `input_test`: lists holding one 1-D array of encoded
        ids per categorical column (in the order of `cat_cols`), followed by
        one 2-D array with the standardized numerical columns. This matches
        a model with one input per categorical feature plus one numerical
        input.
      - `len_embed_cols`: the vocabulary sizes returned by `process_cat_cols`.
  """
  # Remove columns:
  #  beer/name: redundant to beer/beerId
  #  review/appearance, review/aroma, review/overall, review/palate, review/taste: columns to predict on test dataset
  #  review/timeStruct: redundant to review/timeUnix
  #  user/birthdayRaw, user/birthdayUnix: redundant to user/ageInSeconds
  #  review/text: for baseline model we ignore it for now
  unused_cols = ['beer/name', 'review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste',
                 'review/text', 'review/timeStruct', 'user/birthdayRaw', 'user/birthdayUnix']
  X_train = df_train.drop(columns=unused_cols)
  # errors='ignore': the rating columns are the ones to predict, so a test
  # file without them must not raise a KeyError here.
  X_test = df_test.drop(columns=unused_cols, errors='ignore')

  # Input data for the NN model
  input_train = []
  input_test = []

  ### Process categorical features for the model input
  cat_cols = ['beer/beerId', 'beer/brewerId', 'beer/style', 'user/profileName', 'user/gender']
  # Fill the empty entries in gender with 'Unknown'.
  # Assign the result back instead of using `inplace=True` on a column
  # selection, which does not reliably update the frame under pandas
  # Copy-on-Write.
  X_train['user/gender'] = X_train['user/gender'].fillna('Unknown')
  X_test['user/gender'] = X_test['user/gender'].fillna('Unknown')

  len_embed_cols = process_cat_cols(X_train, X_test, cat_cols)
  # Columns to be embedded: rescaled to range [0, # values).
  # One array per column, because the model has a separate input for every
  # embedded feature (a single (n_rows, 5) array would not match them).
  for c in cat_cols:
    input_train.append(X_train[c].values)
    input_test.append(X_test[c].values)

  ### Process numerical features
  numeric_cols = ['beer/ABV', 'review/timeUnix', 'user/ageInSeconds']
  # Fill the missing entries in the age column with the average training age
  mean_age = X_train['user/ageInSeconds'].mean()
  X_train['user/ageInSeconds'] = X_train['user/ageInSeconds'].fillna(mean_age)
  X_test['user/ageInSeconds'] = X_test['user/ageInSeconds'].fillna(mean_age)

  # Standardizes the numerical columns of both frames in place
  process_numeric_cols(X_train, X_test, numeric_cols)

  input_train.append(X_train[numeric_cols].values)
  input_test.append(X_test[numeric_cols].values)

  return input_train, input_test, len_embed_cols

In [0]:
# Preprocess data: build the model inputs for the train and test sets and get
# len_embed_cols, which is later passed to build_model.
input_train, input_test, len_embed_cols = preprocess_data(df_train, df_test)

In [0]:
# Create the model and train the NN

model = build_model(len_embed_cols)

model.fit(
  input_train, y_train,
  epochs=500,
  batch_size=32,
  # 10% of the training data is held out to compute val_loss
  validation_split=0.1,
  # Stop once val_loss has not improved for 30 epochs. restore_best_weights
  # rolls the model back to the best epoch; without it the model used for
  # prediction keeps the weights of the last (30 epochs stale) epoch.
  callbacks=[EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)]
)


In [0]:
# Make predictions on the test data and multiply them by 5 to map the sigmoid
# outputs back to the rating scale, then write the output in the expected format.
rating_cols = ['review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste']

# Plain array multiplication: same scaling as the former np.vectorize lambda,
# without a Python-level call per element.
y_pred = 5 * model.predict(input_test)

# First column is the original test index, followed by one column per rating
result = pd.DataFrame(y_pred, columns=rating_cols)
result.insert(0, 'index', df_test.index.values)
result.to_csv("/content/gdrive/My Drive/colab notebook/beer ratings/resultBaseline.csv", index = False)
