In [0]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip3 install --quiet seaborn

In [3]:
# Text-cleaning dependencies: `contractions` and `beautifulsoup4` (bs4) are
# imported in the import cell below.
!pip install contractions
!pip install beautifulsoup4

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/52/52/a15f0fb338a462045c7c87a35dbaeda11738c45aa9d2f5c76ac191d6adff/contractions-0.0.17-py2.py3-none-any.whl
Installing collected packages: contractions
Successfully installed contractions-0.0.17


In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import time
import contractions #pip install contractions
from bs4 import BeautifulSoup #pip install beautifulsoup4
import unicodedata
import re

In [5]:
# Mount Google Drive at /content/drive/ (Colab only); the dataset CSV is read
# from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


In [6]:
# Read the bz2-compressed review CSV from the mounted Drive folder, then print
# a summary of its columns and dtypes.
reviews_csv_path = '/content/drive/My Drive/Colab Notebooks/movie_reviews.csv.bz2'
df = pd.read_csv(reviews_csv_path, compression='bz2')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null object
dtypes: object(2)
memory usage: 781.3+ KB


In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# Encode the sentiment labels as integers: 'positive' -> 1, anything else -> 0.
# An already-encoded 1 is also accepted, so re-running this cell leaves the
# labels unchanged instead of silently turning every label into 0 (the plain
# comparison `1 == 'positive'` is False).
df['sentiment'] = df['sentiment'].isin(['positive', 1]).astype(int)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Creating Datasets for Training, Testing and Validation:

In [23]:
# Row ranges of the three splits.
# NOTE(review): rows 40000-44999 are not assigned to any split - confirm this is intended.
train_rows = slice(None, 30000)
test_rows = slice(30000, 40000)
val_rows = slice(45000, None)

reviews, labels = df['review'].values, df['sentiment'].values

train_reviews, train_labels = reviews[train_rows], labels[train_rows]
test_reviews, test_labels = reviews[test_rows], labels[test_rows]
val_reviews, val_labels = reviews[val_rows], labels[val_rows]

# Show the size of each split.
(train_reviews.shape, test_reviews.shape, val_reviews.shape)

((30000,), (10000,), (5000,))

In [0]:
def strip_html_tags(text):
    """Return the visible text of an HTML fragment.

    Parses `text` with BeautifulSoup's "html.parser", removes every <iframe>
    and <script> element together with its contents, extracts the remaining
    text, and collapses each run of carriage returns / line feeds into a
    single newline.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Remove the elements from the tree so their contents never reach get_text().
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped = soup.get_text()
    # Only CR and LF belong in this class. The earlier pattern wrote '|' inside
    # the brackets, where it is a literal pipe rather than alternation, so any
    # '|' in the text was being replaced by a newline as well.
    return re.sub(r'[\r\n]+', '\n', stripped)


In [0]:
def remove_accented_chars(text):
    """Return `text` reduced to plain ASCII.

    The string is NFKD-normalised, which splits an accented letter into its
    base letter plus combining marks. Encoding to ASCII with errors ignored
    then drops every character that is not ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [0]:
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

In [0]:
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    When `remove_digits` is true, digits are deleted as well, leaving only
    ASCII letters and whitespace.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)

In [0]:
def clean_doc(document):
    """Normalise one raw review into lower-case, letters-only text.

    Steps, in order: strip HTML, lower-case, turn newlines/tabs/carriage
    returns into spaces, remove accents, expand contractions, pad selected
    punctuation with spaces, delete every character that is not an ASCII
    letter or whitespace, and finally collapse repeated spaces and trim.

    Returns the cleaned string.
    """
    # strip HTML
    doc = strip_html_tags(document)

    # lower case
    doc = doc.lower()

    # replace newlines, tabs and carriage returns with spaces
    doc = doc.translate(str.maketrans("\n\t\r", "   "))

    # remove accented characters
    doc = remove_accented_chars(doc)

    # expand contractions
    doc = expand_contractions(doc)

    # Pad punctuation with spaces before it is deleted, so the words on either
    # side stay separate tokens. The hyphen must be escaped: an unescaped
    # "(-)" inside a character class is the range '(' .. ')', which left '-'
    # unpadded and fused hyphenated words ("well-known" -> "wellknown").
    special_char_pattern = re.compile(r'([{.(\-)!}])')
    doc = special_char_pattern.sub(" \\1 ", doc)
    doc = remove_special_characters(doc, remove_digits=True)

    # remove extra whitespace
    doc = re.sub(' +', ' ', doc)
    return doc.strip()


# Element-wise wrapper so clean_doc can be applied to a whole array of documents.
clean_corpus = np.vectorize(clean_doc)

## Preprocessing the Text:

In [0]:
# Clean every split with the same pipeline (order: train, validation, test).
# Each name is rebound to its cleaned array.
train_reviews, val_reviews, test_reviews = map(
    clean_corpus, (train_reviews, val_reviews, test_reviews)
)


## Preparing the Training:

In [0]:
# Training input over the whole training set: shuffled batches of 256, with
# num_epochs=None so there is no limit on the number of passes over the data.
train_input = tf.estimator.inputs.numpy_input_fn(
    x={'sentence': train_reviews},
    y=train_labels,
    batch_size=256,
    num_epochs=None,
    shuffle=True,
)

In [0]:
# Unshuffled input over the whole training set, used for evaluation.
predict_train = tf.estimator.inputs.numpy_input_fn(
    x={'sentence': train_reviews},
    y=train_labels,
    shuffle=False,
)

In [0]:
# Unshuffled input over the whole validation set, used for evaluation.
predict_val = tf.estimator.inputs.numpy_input_fn(
    x={'sentence': val_reviews},
    y=val_labels,
    shuffle=False,
)

In [0]:
# Unshuffled input over the test set, used for the final evaluation.
predict_test = tf.estimator.inputs.numpy_input_fn(
    x={'sentence': test_reviews},
    y=test_labels,
    shuffle=False,
)

## Loading Embeddings from Universal Sentence Encoder:

In [27]:
# Feature column that embeds the 'sentence' feature with the Universal Sentence
# Encoder module from TF-Hub; trainable=False keeps the module weights fixed.
embeddings = hub.text_embedding_column(
    key='sentence',
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2",
    trainable=False,
)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/2'.
INFO:tensorflow:Downloading https://tfhub.dev/google/universal-sentence-encoder/2: 788.00MB
INFO:tensorflow:Downloaded https://tfhub.dev/google/universal-sentence-encoder/2, Total size: 993.27MB
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/2'.


## Creating the Classifier:

In [28]:
# Binary classifier on top of the sentence embeddings: two hidden ReLU layers
# (512 then 128 units), dropout=0.2, optimised with Adagrad at learning rate 0.005.
adagrad = tf.train.AdagradOptimizer(learning_rate=0.005)
cls = tf.estimator.DNNClassifier(
    hidden_units=[512, 128],
    feature_columns=[embeddings],
    n_classes=2,
    activation_fn=tf.nn.relu,
    dropout=0.2,
    optimizer=adagrad,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp0a7vg04h', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb48877d240>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


## Training the model:

In [30]:
%%time
tf.logging.set_verbosity(tf.logging.ERROR)

epochs = 1000
step = 100
for s in range(0, epochs+1, step):
    print()
    print('-'*150)
    print('Epoch =', s)
    ini_time = time.time()
    cls.train(input_fn=train_input, steps=step)
    total_time = time.time() - ini_time
    print('Train Time (s):', total_time)
    print('Metrics for Train):', cls.evaluate(input_fn=predict_train))
    print('Metrics for Validation):', cls.evaluate(input_fn=predict_val))


----------------------------------------------------------------------------------------------------
Training for step = 0
Train Time (s): 85.70678925514221
Metrics (Train): {'accuracy': 0.83463335, 'accuracy_baseline': 0.5005, 'auc': 0.9252298, 'auc_precision_recall': 0.925236, 'average_loss': 0.36300963, 'label/mean': 0.5005, 'loss': 46.341656, 'precision': 0.8832139, 'prediction/mean': 0.4462625, 'recall': 0.7716284, 'global_step': 100}
Metrics (Validation): {'accuracy': 0.8336, 'accuracy_baseline': 0.50600004, 'auc': 0.9234706, 'auc_precision_recall': 0.9208938, 'average_loss': 0.36624637, 'label/mean': 0.494, 'loss': 45.780796, 'precision': 0.88164026, 'prediction/mean': 0.443118, 'recall': 0.7659919, 'global_step': 100}

----------------------------------------------------------------------------------------------------
Training for step = 100
Train Time (s): 82.87584829330444
Metrics (Train): {'accuracy': 0.84506667, 'accuracy_baseline': 0.5005, 'auc': 0.9297993, 'auc_precision

## Testing the model:

In [32]:
# Evaluate the trained classifier on the test input and display its metrics.
cls.evaluate(input_fn=predict_test)

{'accuracy': 0.8572,
 'accuracy_baseline': 0.5022,
 'auc': 0.93549216,
 'auc_precision_recall': 0.93376917,
 'average_loss': 0.32665488,
 'global_step': 1100,
 'label/mean': 0.4978,
 'loss': 41.34872,
 'precision': 0.8707185,
 'prediction/mean': 0.47735542,
 'recall': 0.83748496}