In [43]:
!pip install contractions textsearch



In [0]:
# TensorFlow
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Activation, Dropout, Input, Embedding, BatchNormalization, Reshape, Concatenate, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Helper libraries
import pandas as pd
import numpy as np
import math

# Preprocessing data
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder


# NLP denpendencies
import nltk
from nltk.tokenize import word_tokenize
import contractions
import re

# Maximal length of a review text
MAX_LEN = 250

In [45]:
nltk.download('stopwords')
nltk.download('punkt')
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove('no')
stopwords.remove('not')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
# Mount to google drive folder
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Training data
df_train = pd.read_csv("/content/gdrive/My Drive/colab notebook/beer ratings/train.csv", index_col=['index'])

# Divide the values by 5 to match the outputs of sigmoid function in the output layer of the following model
y_train = df_train[['review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste']].applymap(lambda x: x / 5)

# Test data
df_test = pd.read_csv("/content/gdrive/My Drive/colab notebook/beer ratings/test.csv", index_col=['index'])

In [0]:
def build_model(len_embed_cols, num_words):
    
  model_out = []
  model_in  = []

  # Embedding inputs for categorical features
  for dim in len_embed_cols:
      input_dim = Input(shape=(1,), dtype='int32')
      embed_dim = Embedding(dim, max(2, int(math.log(dim))), input_length=1)(input_dim)
      embed_dim = Dropout(0.25)(embed_dim)
      embed_dim = Reshape((max(2, int(math.log(dim))),))(embed_dim)
      model_out.append(embed_dim)
      model_in.append(input_dim)

  # Numerical features
  input_numeric = Input(shape=(3,), dtype='float32')
  model_in.append(input_numeric)
  
  # Text feature
  input_text = Input(shape=(MAX_LEN, ), dtype='int32')
  embed_text = Embedding(num_words, 40, input_length=MAX_LEN)(input_text)
  embed_text = Dropout(0.25)(embed_text)
  embed_text = Flatten()(embed_text)
  embed_text = Dense(6)(embed_text)
  embed_text = Activation('relu')(embed_text)
  model_out.append(embed_text)
  model_in.append(input_text)

  # Combining the output of embedding model and numerical features as the input of the final model
  outputs = Concatenate(axis=1)([*model_out, input_numeric])
  outputs = Dense(512)(outputs) 
  outputs = BatchNormalization()(outputs)
  outputs = Activation('relu')(outputs)
  outputs = Dropout(0.5)(outputs)
  outputs = Dense(256)(outputs) 
  outputs = BatchNormalization()(outputs)
  outputs = Activation('relu')(outputs)
  outputs = Dropout(0.5)(outputs)
  outputs = Dense(128)(outputs) 
  outputs = BatchNormalization()(outputs)
  outputs = Activation('relu')(outputs)
  outputs = Dropout(0.5)(outputs)
  outputs = Dense(5)(outputs)
  outputs = Activation('sigmoid')(outputs)

  model = Model(model_in, outputs)

  model.compile(optimizer='sgd', 
                loss='mse',
                metrics=['mae', 'acc'])

  return model

In [0]:
def normalize_text(text):
  # Extract the text and lower the case
  text = str(text).lower()
  # Expand contractions
  text = contractions.fix(text)
  # Remove redundant spaces
  text = re.sub('[\s+]', ' ', text)
  # Remove special characters and numbers
  text = re.sub('[^a-zA-z\s]', '', text)
  # Tokenize the text
  tokens = word_tokenize(text)
  # Remove stopwords
  words = [token for token in tokens if token not in stopwords]
  # Join the words
  normalized_text = ' '.join(words)
  return normalized_text

In [0]:
def process_cat_cols(X_train, X_test, cat_cols):
  # We use entity embedding to deal with the categorical features
  col_vals_dict = {c: list(X_train[c].unique()) for c in cat_cols}
  print(col_vals_dict.keys())

  # LabelEncoder
  for c in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(X_train[c].values) + list(X_test[c].values))
    X_train[c] = lbl.transform(list(X_train[c].values))
    X_test[c] = lbl.transform(list(X_test[c].values))

  len_embed_cols = []
  for c in cat_cols:
    len_embed_cols.append(len(col_vals_dict[c]))
    print(c + ': %d values' % len(col_vals_dict[c])) #look at value counts to know the embedding dimensions

  print('\nNumber of categorical features :', len(cat_cols))
  
  return len_embed_cols
  

In [0]:
def process_numeric_cols(X_train, X_test, numeric_cols):
  # standardize beer/ABV, review/timeUnix, user/ageInSeconds
  scaler = StandardScaler()
  X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
  X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
  
  print('\nNumber of numerical features :', len(numeric_cols))

In [0]:
def process_text_col(X_train, X_test, text_col):
  # Extract the texts
  reviews_train = df_train[text_col]
  reviews_test = df_test[text_col]

  # Normalize text
  reviews_lines_train = reviews_train.map(normalize_text)
  reviews_lines_test = reviews_test.map(normalize_text)

  # Tokenize the text
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(reviews_lines_train)
  words_seq_train = tokenizer.texts_to_sequences(reviews_lines_train)
  words_seq_test = tokenizer.texts_to_sequences(reviews_lines_test)
  
  # Padding the sequence to MAX_LEN with 0 or truncate it if its length exceeds the limit
  words_seq_train_padded = pad_sequences(words_seq_train, maxlen=MAX_LEN)
  words_seq_test_padded = pad_sequences(words_seq_test, maxlen=MAX_LEN)
  num_words = len(tokenizer.word_index) + 1
  print('\nVocabulary length :', num_words)
  
  return num_words, words_seq_train_padded, words_seq_test_padded

In [0]:
def preprocess_data(df_train, df_test):
  # Remove columns:
  #  beer/name: brewerId together with beer/style approximately indicates the beer
  #  review/appearance, review/aroma, review/overall, review/palate, review/taste: columns to predict on test dataset
  #  review/timeStruct: redundant to review/timeUnix
  #  user/birthdayRaw, user/birthdayUnix: redundant to user/ageInSeconds
  X_train = df_train.drop(columns=['beer/name', 'review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste', 
                   'review/timeStruct', 'user/birthdayRaw', 'user/birthdayUnix'])
  X_test = df_test.drop(columns=['beer/name', 'review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste', 
                   'review/timeStruct', 'user/birthdayRaw', 'user/birthdayUnix'])
  
  # Input data for the NN model
  input_train = []
  input_test = [] 
  
  ### Process categorical features for the model input
  cat_cols = ['beer/beerId', 'beer/brewerId', 'beer/style', 'user/profileName', 'user/gender']
  # Fill the empty entries in gender with 'Unknown'
  X_train['user/gender'].fillna('Unknown', inplace=True)
  X_test['user/gender'].fillna('Unknown', inplace=True)
   
  len_embed_cols = process_cat_cols(X_train, X_test, cat_cols) 
  # Columns to be embedded: rescaling to range [0, # values)
  for c in cat_cols:
    input_train.append(X_train[c].values)
    input_test.append(X_test[c].values)
  
  ### Process numerical features
  numeric_cols = ['beer/ABV', 'review/timeUnix', 'user/ageInSeconds']
  # Fill the entries in age column with the average age
  mean_age = X_train['user/ageInSeconds'].mean()
  X_train['user/ageInSeconds'].fillna(mean_age, inplace=True)
  X_test['user/ageInSeconds'].fillna(mean_age, inplace=True)
  
  process_numeric_cols(X_train, X_test, numeric_cols)
  
  input_train.append(X_train[numeric_cols].values)
  input_test.append(X_test[numeric_cols].values)
  
  ### Process review/text 
  text_col = 'review/text'
  
  num_words, words_seq_train_padded, words_seq_test_padded = process_text_col(X_train, X_test, text_col)
    
  input_train.append(words_seq_train_padded)
  input_test.append(words_seq_test_padded)

  return input_train, input_test, len_embed_cols, num_words

In [54]:
# Preprocess data
input_train, input_test, len_embed_cols, num_words = preprocess_data(df_train, df_test)


dict_keys(['beer/beerId', 'beer/brewerId', 'beer/style', 'user/profileName', 'user/gender'])
beer/beerId: 1731 values
beer/brewerId: 212 values
beer/style: 95 values
user/profileName: 7442 values
user/gender: 3 values

Number of categorical features : 5

Number of numerical features : 3

Vocabulary length : 59191


In [0]:
# Create model and train the NN

model = build_model(len_embed_cols, num_words)

model.fit(input_train, y_train,
	epochs=400,
	batch_size=32,
	validation_split=0.1,
  callbacks = [EarlyStopping(monitor='val_loss', patience=50)]
)


Train on 33750 samples, validate on 3750 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400

In [0]:
# Make prediction on the test data and multiply them by 5 to match the ratings, write the output in corresponding format
y_pred = np.vectorize(lambda x: 5 * x)(model.predict(input_test))
index = pd.DataFrame(data=df_test.index.values, columns=['index'])
result = pd.concat([index, pd.DataFrame(y_pred, columns=['review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste'])], axis=1)
result.to_csv("/content/gdrive/My Drive/colab notebook/beer ratings/result.csv", index = False)
