In [None]:
from google.colab import drive

# Mount Google Drive so the dataset and weight files under
# '/content/gdrive/My Drive/...' are reachable; force_remount=True
# re-mounts even when the mount point is already in use.
drive.mount(
    '/content/gdrive',
    force_remount=True,
)

Mounted at /content/gdrive


In [None]:
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/00/92/a05b76a692ac08d470ae5c23873cf1c9a041532f1ee065e74b374f218306/contractions-0.0.25-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 18.2MB/s 
[?25hCollecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 47.5MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
 

In [None]:
import numpy as np

import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
from keras.utils import np_utils
import contractions
import numpy as np
import re
import tqdm
import unicodedata
# Report the runtime environment so results can be tied to library versions.
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)

Version:  2.3.0
Eager mode:  True
Hub version:  0.10.0
GPU is available


#### Load preprocessed data 

In [None]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code while deserialising;
    # only point this at files you created or otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the pre-processed train/test texts and their ratings from Drive.
train_content = _load_pickle('/content/gdrive/My Drive/upwork/train_content_new.pkl')
train_rating = _load_pickle('/content/gdrive/My Drive/upwork/train_rating_new.pkl')
test_content = _load_pickle('/content/gdrive/My Drive/upwork/test_content_new.pkl')
test_rating = _load_pickle('/content/gdrive/My Drive/upwork/test_rating_new.pkl')

### Basic Text Pre-processing
We do minimal text pre-processing here since we are using deep learning models and not count-based methods. Steps include the following:



*   Lowercasing the text
*   Converting accented characters
*   Fixing contractions
*   Removing special characters and extra whitespace

Note: For some models, such as BERT, we don't apply any pre-processing!

In [None]:
def remove_accented_chars(text):
    """Return `text` with accented characters folded to plain ASCII.

    The string is NFKD-normalised (splitting base characters from their
    combining marks), then every character that cannot be encoded as ASCII
    is dropped.

    Args:
        text: input string.

    Returns:
        The ASCII-only version of `text`.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
    """Normalise a collection of raw text documents.

    Each document goes through these steps, in order:
      1. newlines, tabs and carriage returns are replaced by spaces;
      2. the text is lower-cased;
      3. accented characters are folded to ASCII (remove_accented_chars);
      4. contractions are expanded (contractions.fix);
      5. every character that is not an ASCII letter, digit or whitespace
         is replaced by a space;
      6. runs of spaces are collapsed and the ends are stripped.

    Args:
        docs: iterable of str documents.

    Returns:
        list of normalised str documents, in input order.
    """
    # Built once: the table does not depend on the document.
    whitespace_table = str.maketrans("\n\t\r", "   ")

    norm_docs = []
    for doc in tqdm.tqdm(docs):
        doc = doc.translate(whitespace_table)
        doc = doc.lower()
        doc = remove_accented_chars(doc)
        doc = contractions.fix(doc)
        # Remove special characters. The flags MUST be passed by keyword:
        # the fourth positional parameter of re.sub is `count`, so passing
        # re.I|re.A positionally capped the substitution at 258 replacements
        # per document and left later special characters in place.
        doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc, flags=re.I | re.A)
        # Collapse the runs of spaces produced by the replacements above.
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()
        norm_docs.append(doc)
    return norm_docs

In [None]:
%%time
train_content = pre_process_corpus(train_content)
test_content = pre_process_corpus(test_content)

100%|██████████| 458266/458266 [01:00<00:00, 7564.45it/s]
100%|██████████| 50919/50919 [00:06<00:00, 7513.96it/s]

CPU times: user 1min 6s, sys: 861 ms, total: 1min 7s
Wall time: 1min 7s





### Convert the target variable to one-hot encoding, since it's a multiclass classification problem

In [None]:
# One-hot encode the integer ratings for use with categorical cross-entropy.
# The class count is derived once from the training labels (the same value
# to_categorical would infer for them) and reused for the test labels, so
# both matrices always have the same width even if the test split happens
# not to contain the highest rating.
# NOTE: this cell overwrites its inputs; re-load the ratings before
# re-running it, otherwise the already-encoded arrays get encoded again.
num_classes = int(np.max(train_rating)) + 1
train_rating = np_utils.to_categorical(train_rating, num_classes=num_classes)
test_rating = np_utils.to_categorical(test_rating, num_classes=num_classes)

### Building input pipeline

In [None]:
# tf.data pipeline over (text, one-hot rating) pairs:
# shuffle within a 1024-element buffer, batch by 64, and prefetch so input
# preparation overlaps with training.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_content, train_rating))
    .shuffle(buffer_size=1024)
    .batch(64)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

### Build a NNLM Embedding Layer

In [None]:
# TF-Hub handle of the NNLM sentence-embedding module.
# Note: the name `model` is rebound to the Keras model in the next code cell.
model = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"

# Wrap the module as a Keras layer that takes scalar strings (input_shape=[])
# and is fine-tuned together with the classifier (trainable=True).
hub_layer = hub.KerasLayer(
    model,
    input_shape=[],
    output_shape=[128],
    dtype=tf.string,
    trainable=True,
)

### Build Model Architecture

In [None]:

# Classifier: hub embedding -> two Dense(128, relu) blocks with dropout
# -> 6-way softmax output.
model = tf.keras.models.Sequential([
    hub_layer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.15),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.15),
    tf.keras.layers.Dense(6, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 128)               124642688 
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 774       
Total params: 124,676,486
Trainable params: 124,676,486
Non-trainable params: 0
__________________________________________

### Load pretrained weights if available

In [None]:
import os

# Upper bound on the number of outer training rounds run by the training loop.
epochs = 100
# Location on Drive where the model is saved and resumed from.
filepath = "/content/gdrive/My Drive/upwork/weights/model.h5"

# Resume from a previous run only when a saved file actually exists; an
# unconditional load_weights() raises on the very first run.
if os.path.exists(filepath):
    model.load_weights(filepath)
    print("Loaded pretrained weights from", filepath)
else:
    print("No pretrained weights found at", filepath, "- training from scratch")

### Define callbacks 

In [None]:

# Save the full model to `filepath` whenever validation accuracy improves.
# The model is compiled with metrics=['accuracy'], so Keras logs the
# validation metric as 'val_accuracy'; the legacy name 'val_acc' is never
# produced and made the checkpoint skip every save.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    save_weights_only=False,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Stop a fit() call once validation loss has not improved for 3 consecutive
# epochs, and roll the model back to the best weights seen in that call.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      restore_best_weights=True,
                                      verbose=1)
    


### Train the model

In [None]:
# The callbacks monitor validation metrics, so fit() must be given validation
# data. The previous `validation_steps=0.05` is not a valid step count and is
# ignored when no validation_data is passed, which left both callbacks inert.
# NOTE(review): the held-out test split is used for validation because it is
# the only separate data available in this notebook. If final test metrics
# are to be reported, carve a dedicated validation split out of the training
# data instead - confirm the intended protocol.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((test_content, test_rating))
    .batch(64)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Train in rounds of 5 epochs, saving after every round so progress survives
# a Colab disconnect. `shuffle=True` was dropped: fit() ignores it for
# tf.data inputs, and the dataset already shuffles itself.
for epoch in range(epochs):
    model.fit(train_dataset,
              validation_data=val_dataset,
              epochs=5,
              callbacks=[es, model_checkpoint_callback],
              verbose=1)
    # Kept so the latest state is persisted even in a round where the
    # checkpoint callback did not write anything.
    model.save(filepath)
    # EarlyStopping resets at the start of every fit() call; without this
    # check the outer loop would keep training after it has triggered.
    if es.stopped_epoch > 0:
        print("Early stopping triggered in round", epoch + 1, "- stopping training")
        break

Epoch 1/5








Epoch 2/5








Epoch 3/5








Epoch 4/5








Epoch 5/5








Epoch 1/5








Epoch 2/5
1396/7161 [====>.........................] - ETA: 7:08 - loss: 0.0473 - accuracy: 0.9843

In [None]:
# Persist the final model under a separate name from the resumable checkpoint.
final_model_path = '/content/gdrive/My Drive/upwork/weights/new_model.h5'
model.save(final_model_path)