In [1]:
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.6/dist-packages (0.2.3)


In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
from collections import Counter
import os, re, pickle, random
from os.path import join as joinpath
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

seed = 0
random.seed(seed)
np.random.seed(seed)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import class_weight

from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup

import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

from keras.layers import *
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras import callbacks  
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text
from tensorflow.keras.models import load_model

from tensorflow.python.client import device_lib ; device_lib.list_local_devices()

Using TensorFlow backend.


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 955809651178628472, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 16425924604354986932
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 13552247149160060775
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11330115994
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 2994880673032347943
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"]

In [4]:
try:
  print(os.getcwd())
  os.chdir('drive/My Drive/Colab Notebooks/crowdflower-search-relevance')
  print('Changed directory')
  print(os.getcwd())
except:
  print('Cannot change directory')
  print(os.getcwd())

/content
Changed directory
/content/drive/My Drive/Colab Notebooks/crowdflower-search-relevance


In [5]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_train.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,5,wine rack,Concept Housewares WR-44526 Solid-Wood Ceiling...,"Like a silent and sturdy tree, the Southern En...",4,0.0
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb ...,"WTGR1011\nFeatures\nNickel base, 60,000 averag...",2,0.471


In [6]:
Counter(df_train.median_relevance)

Counter({1: 774, 2: 1476, 3: 1737, 4: 6171})

In [7]:
df_train.isna().sum()

id                        0
query                     0
product_title             0
product_description    2444
median_relevance          0
relevance_variance        0
dtype: int64

In [8]:
df_test.isna().sum()

id                        0
query                     0
product_title             0
product_description    5427
dtype: int64

In [9]:
df_train.fillna('', inplace=True)
df_test.fillna('', inplace=True)
df_train.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,5,wine rack,Concept Housewares WR-44526 Solid-Wood Ceiling...,"Like a silent and sturdy tree, the Southern En...",4,0.0
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb ...,"WTGR1011\nFeatures\nNickel base, 60,000 averag...",2,0.471


In [0]:
def remove_numbers(val):
    m = re.findall(r'([A-Za-z]\w+)', val, flags=re.I)
    return ' '.join(m)

# stemmer = PorterStemmer()

def text_cleaner(x, append):
  x = BeautifulSoup(x).get_text(" ")
  x = re.sub("[^a-zA-Z0-9]", " ", x)
  # x = ' '.join([stemmer.stem(w) for w in x.split()])
  if append:
    return ' '.join([append+w for w in x.split()])
  else:
    return x

In [11]:
for col in ['query', 'product_title', 'product_description']:
  if col == 'query':
    append = None
  elif col == 'product_title':
    append = None
  elif col == 'product_description':
    append = None
  df_train[col] = df_train[col].progress_apply(lambda x: text_cleaner(x, append))
  df_test[col] = df_test[col].progress_apply(lambda x: text_cleaner(x, append))

# df_train['product_title'] = df_train[['product_title', 'product_description']].apply(lambda x: ' '.join(x), axis=1)
# df_test['product_title'] = df_test[['product_title', 'product_description']].apply(lambda x: ' '.join(x), axis=1)
df_train.head()

100%|██████████| 10158/10158 [00:04<00:00, 2495.49it/s]
100%|██████████| 22513/22513 [00:08<00:00, 2635.78it/s]
100%|██████████| 10158/10158 [00:03<00:00, 2708.19it/s]
100%|██████████| 22513/22513 [00:08<00:00, 2596.16it/s]
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
100%|██████████| 10158/10158 [00:03<00:00, 3247.78it/s]
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
100%|██████████| 22513/22513 [00:08<00:00, 2530.18it/s]


Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design Red Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,5,wine rack,Concept Housewares WR 44526 Solid Wood Ceiling...,Like a silent and sturdy tree the Southern En...,4,0.0
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb ...,WTGR1011 Features Nickel base 60 000 average ...,2,0.471


In [12]:
df_negative = df_train.copy()
df_negative['query'] = np.random.permutation(df_train['query'].values)
df_negative['median_relevance'] = 1
df_negative = df_negative.sample(frac=0.3)
df_negative.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
1086,3497,duck dynasty,Cuff Luv Peach Tie Dye Infinity Scarf,Add a touch of warmth to any outfit with this ...,1,0.0
775,2532,gucci guilty intense women,Somedays Lovin The Bunker Tie Dye Dress Dres...,,1,0.0
3231,10339,mens leather shirt,LED USB Essential Oil Ultrasonic Air Humidifie...,,1,0.0
2534,8072,rachel ray cookware,Kitchen Garden Propagator,Start seeds for your outdoor garden in this co...,1,0.894
321,1026,galaxy note 3,Rachael Ray Orange Porcelain Enamel Cookware 5...,Whether you re saute ing foods or preparing ri...,1,0.0


In [13]:
df_train = pd.concat([df_train, df_negative], ignore_index=True, sort=False)
df_train.shape

(13205, 6)

In [14]:
Counter(df_train.median_relevance)

Counter({1: 3821, 2: 1476, 3: 1737, 4: 6171})

In [15]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('bert-base-nli-mean-tokens')

train = {}
test = {}

for col in tqdm(['query', 'product_title', 'product_description']):
  train[col] = np.array(embedder.encode(df_train[col]))
  test[col] = np.array(embedder.encode(df_test[col]))

100%|██████████| 3/3 [10:59<00:00, 201.49s/it]


In [16]:
x_train = np.hstack([train['query'], train['product_title'], train['product_description']])
# x_train_neg = np.hstack([np.random.permutation(train['query']), train['product_title'], train['product_description']])
# x_train = np.vstack([x_train, x_train_neg]

x_test = np.hstack([test['query'], test['product_title'], test['product_description']])
x_train.shape

(13205, 2304)

In [0]:
# import keras.backend as K
# # https://www.kaggle.com/christofhenkel/weighted-kappa-loss-for-keras-tensorflow
# def _cohen_kappa(y_true, y_pred, num_classes, weights=None, metrics_collections=None, updates_collections=None, name=None):
#    kappa, update_op = tf.contrib.metrics.cohen_kappa(y_true, y_pred, num_classes, weights, metrics_collections, updates_collections, name)
#    K.get_session().run(tf.local_variables_initializer())
#    with tf.control_dependencies([update_op]):
#       kappa = tf.identity(kappa)
#    return kappa

# def cohen_kappa_loss(num_classes, weights=None, metrics_collections=None, updates_collections=None, name=None):
#    def cohen_kappa(y_true, y_pred):
#       return -_cohen_kappa(y_true, y_pred, num_classes, weights, metrics_collections, updates_collections, name)
#    return cohen_kappa

In [18]:
encoders = {}
encoders['y'] = OneHotEncoder()
y_train = df_train['median_relevance'].values
y_train = encoders['y'].fit_transform(y_train.reshape(-1,1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [19]:
print(x_train.shape)
print(y_train.shape)

(13205, 2304)
(13205, 4)


In [20]:
model = Sequential()

model = Sequential()
model.add(Dense(512, input_dim=x_train.shape[1]))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(256))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(128))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

cb_save_model = callbacks.ModelCheckpoint(joinpath('models', 'st-model.h5'),
                                                  monitor='val_acc', save_best_only=True, verbose=2)
cb_early_stopping = callbacks.EarlyStopping(monitor='val_acc', min_delta=0.001, patience=5, verbose=0, mode='auto')

all_callbacks = [cb_save_model, cb_early_stopping]

model.fit(x=x_train, y=y_train, batch_size=32, epochs=100, 
             verbose=1, validation_split=0.1, shuffle=True, 
            #  class_weight=class_weights, 
             callbacks=all_callbacks)

























Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.














Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




















Train on 11884 samples, validate on 1321 samples
Epoch 1/100































Epoch 00001: val_acc improved from -inf to 0.25132, saving model to models/st-model.h5
Epoch 2/100

Epoch 00002: val_acc improved from 0.25132 to 0.31188, saving model to models/st-model.h5
Epoch 3/100

Epoch 00003: val_acc improved from 0.31188 to 0.44209, saving model to models/st-model.h5
Epoch 4/100

Epoch 00004: val_acc improved from 0.44209 to 0.55413, saving model to models/st-model.h5
Epoch 5/100

Epoch 00005: val_acc improved from 0.55413 to 0.75927, saving model to models/st-model.h5
Epoch 6/100

Epoch 00006: val_acc improved from 0.75927 to 0.79182, saving model to models/st-model.h5
Epoch 7/100

Epoch 00007: val_acc improved from 0.79182 to 0.79561, saving model to models/st-model.h5
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.79561
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.79561
Epoch 10/100

Epoch 00010: val_acc improved from 0.79561 to 0.85087, saving model to models/st-model.h5
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.85087


<keras.callbacks.History at 0x7f939d715c50>

In [21]:
model = load_model('models/st-model.h5')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [22]:
print('Making test predictions')
t_labels = model.predict(x_test)
t_labels = [int(x) for x in encoders['y'].inverse_transform(t_labels)]
print('Done!')

df_test['prediction'] = t_labels

submission_file_path = 'data/submission.csv'
df_test[['id', 'prediction']].to_csv(submission_file_path, index=False)
print('Submission saved!')

Making test predictions
Done!
Submission saved!


In [0]:
# y_test = np.around(y_test)
# y_test

In [0]:
# np.unique(y_test)

In [0]:
# y_test = [1 if x<1 else x for x in y_test]
# y_test = [4 if x>4 else x for x in y_test]
# y_test = [int(x) for x in y_test]
# np.unique(y_test)