<a href="https://colab.research.google.com/github/anantha5ani/Coursera-Introduction-to-Python/blob/master/DeepLearningWithPython_Chapter6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 6 - Deep learning for text and sequences

In [0]:
# set tensorflow to version 2
%tensorflow_version 2.x # run this step everytime the kernel is restarted
import tensorflow as tf

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x # run this step everytime the kernel is restarted`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


In [0]:
# Word-level one-hot encoding (toy example)
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build the vocabulary: every distinct whitespace-separated token receives the
# next free integer, starting at 1 (index 0 is never assigned to a token).
token_index = {}
for sentence in samples:
  for token in sentence.split():
    token_index.setdefault(token, len(token_index) + 1)

# Only the first max_length tokens of each sample are encoded.
max_length = 10

# One slot per assigned index, plus the unused slot 0.
vocab_slots = max(token_index.values()) + 1
results = np.zeros(shape=(len(samples), max_length, vocab_slots))
for row, sentence in enumerate(samples):
  for col, token in enumerate(sentence.split()[:max_length]):
    results[row, col, token_index[token]] = 1.

In [0]:
# Character-level one-hot encoding (toy example)
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

characters = string.printable
# Map each character to a unique integer index starting at 1 (index 0 is never
# assigned). The mapping must go character -> index: with the reverse direction
# the lookup below returns None, and indexing with None sets the whole row to 1.
token_index = dict(zip(characters, range(1, len(characters) + 1)))
max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
  # Only the first max_length characters fit in the results tensor.
  for j, character in enumerate(sample[:max_length]):
    index = token_index.get(character)
    if index is None:
      # Character outside string.printable: leave this position all zeros.
      continue
    results[i, j, index] = 1.

In [0]:
# Using Keras for word-level one-hot encoding
from tensorflow.keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Tokenizer capped at num_words=1000, then fitted on the toy samples to
# build its word index.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# Word -> integer index mapping recovered during fitting.
word_index = tokenizer.word_index

# Each sample as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# One row per sample, encoded in 'binary' mode.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

print('Found %s unique tokens.' % len(word_index))

Found 9 unique tokens.


In [0]:
# Display the word -> index mapping taken from the fitted tokenizer.
word_index

{'ate': 7,
 'cat': 2,
 'dog': 6,
 'homework': 9,
 'mat': 5,
 'my': 8,
 'on': 4,
 'sat': 3,
 'the': 1}

In [0]:
# Word-level one-hot encoding with hashing trick (toy example)
import zlib

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Words are stored as vectors of size 1,000; with many more distinct words
# than slots, different words will collide on the same index.
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length]:
    # Use a stable hash: the built-in hash() of a str is salted per
    # interpreter process, so it would give a different encoding after
    # every kernel restart.
    index = zlib.crc32(word.encode('utf-8')) % dimensionality
    results[i, j, index] = 1.

In [0]:
# Instantiating an Embedding layer
from tensorflow.keras.layers import Embedding

# The Embedding layer takes at least two arguments:
#   - the number of possible tokens (here, 1,000: 1 + maximum word index)
#   - the dimensionality of the embeddings (here, 64)
embedding_layer = Embedding(1000, 64)

In [0]:
# Loading the IMDB data for use with an Embedding layer
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing

max_features = 10000  # Number of words to consider as features
maxlen = 20  # Cuts off the text after this number of words (among the max_features most common words)

# Loads the data as lists of integers
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=max_features)

# Turns the lists of integers into a 2D integer tensor of shape (samples, maxlen).
# (The shape used to sit on its own line, where it ran as a stray tuple
# expression that depended on `samples` from an earlier cell.)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [0]:
# Using an Embedding layer and classifier on the IMDB data
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding

model = Sequential()
# The vocabulary size is taken from max_features (the num_words cap used when
# loading the data) so the two cannot drift apart. input_length is given so
# the embedded inputs can be flattened afterwards; the embedding output has
# shape (samples, maxlen, 8).
model.add(Embedding(max_features, 8, input_length=maxlen))

# Flattens the 3D tensor of embeddings into a 2D tensor of shape
# (samples, maxlen * 8).
model.add(Flatten())

# Adds the classifier on top: a single sigmoid unit for binary classification.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# Holds out the last 20% of the training data for validation.
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Download the raw IMDB dataset to build a model from scratch.

In [0]:
### Mount Google Drive at /content/gdrive so files stored there can be read.
### NOTE(review): the earlier wording said to "uncomment" these lines on a first run,
### but they are not commented out and execute on every run - confirm the intent.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import os
# Switch the working directory to the dataset folder under the Drive mount
# point, so later relative paths (e.g. aclImdb.zip) resolve against it.
os.chdir('/content/gdrive/My Drive/kaggle/imdb')

In [0]:
### extract zip files into folders
!unzip -q aclImdb.zip -d aclImdb/  #unzip data in aclImdb/

replace aclImdb/aclImdb/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/__MACOSX/aclImdb/._.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/aclImdb/test/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/__MACOSX/aclImdb/test/._.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/aclImdb/test/neg/0_2.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/__MACOSX/aclImdb/test/neg/._0_2.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/aclImdb/test/neg/10000_4.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/__MACOSX/aclImdb/test/neg/._10000_4.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/aclImdb/test/neg/10001_1.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/__MACOSX/aclImdb/test/neg/._10001_1.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/aclImdb/test/neg/10002_3.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace aclImdb/__MACOSX/aclImdb/test/neg

In [0]:
# Processing the labels of the raw IMDB data
import os

imdb_dir = os.path.join(os.getcwd(), 'aclImdb/aclImdb')
train_dir = os.path.join(imdb_dir, 'train')

labels = []  # 0 for a review read from 'neg', 1 for a review read from 'pos'
texts = []   # raw review text, in the same order as labels

for label_type in ['neg', 'pos']:
  dir_name = os.path.join(train_dir, label_type)
  # Sorted so the order of texts/labels does not depend on the order in which
  # the filesystem happens to list the files.
  for fname in sorted(os.listdir(dir_name)):
    if fname.endswith('.txt'):
      # The with-block closes the file even if reading fails; the explicit
      # encoding avoids depending on the platform's default locale.
      with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
        texts.append(f.read())
      labels.append(0 if label_type == 'neg' else 1)

In [0]:
# List the current working directory (shows the archive and the extracted folder).
os.listdir()

['aclImdb.zip', 'aclImdb']

In [0]:
# Tokenizing the text of the raw IMDB data
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100                # sequence length passed to pad_sequences
training_samples = 200      # number of samples kept for training
validation_samples = 10000  # number of samples kept for validation
max_words = 10000           # num_words cap given to the Tokenizer
shuffle_seed = 42           # fixed seed so the train/validation split is reproducible

# NOTE: re-run the cell that builds `texts` and `labels` before re-running this
# one. `labels` is overwritten below with its shuffled version, while `data` is
# rebuilt from `texts` in their original order, so a second run on its own
# would pair reviews with the wrong labels.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle before splitting: the samples were collected in order (all 'neg'
# reviews first, then all 'pos'). A seeded generator gives the same
# permutation on every run, and the same permutation is applied to both arrays.
indices = np.arange(data.shape[0])
np.random.default_rng(shuffle_seed).shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]