<a href="https://colab.research.google.com/github/anshupandey/Natural_language_Processing/blob/master/Transfer_Learning_NLP_BERT_L2_H512_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip install tensorflow-text --quiet

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import models,layers
import tensorflow_text as text

In [5]:
# TF Hub handles of the pretrained preprocessing model and the BERT encoder
PREPROCESS_URL = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
ENCODER_URL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-12-h-512-a-8/versions/2"

# input layer: a batch of raw strings (one scalar string per sample)
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

# preprocessing layer: a pretrained preprocessor turns the raw text into the encoder inputs
prep = hub.KerasLayer(PREPROCESS_URL)
encoder_inputs = prep(text_input)

# model layer: the pretrained BERT encoder, loaded with trainable=True
encoder = hub.KerasLayer(ENCODER_URL, trainable=True)

# run the encoder; the result is indexed by output name below
outputs = encoder(encoder_inputs)

# pooled output, sentence --> vector
pooled_output = outputs["pooled_output"]      # [batch_size, 512]

# sequence output, word --> vector
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 512]

In [6]:

# sent2vec model: raw strings in, pooled BERT sentence vectors out
embedding_model = tf.keras.Model(inputs=text_input, outputs=pooled_output)

# sanity check on a single sentence
sentences = tf.constant(["Hello world from Python"])
print(embedding_model(sentences).shape)

(1, 512)


In [7]:
# word2vec model: raw strings in, one BERT vector per token position out
embedding_model2 = tf.keras.Model(inputs=text_input, outputs=sequence_output)

# sanity check on a single sentence
sentences = tf.constant(["My name is anshu here, who are you"])
print(embedding_model2(sentences).shape)

(1, 128, 512)


In [8]:
# The input batch is a rank-1 string tensor: one entry per sentence
sentences.shape

TensorShape([1])

In [9]:
# Three one-document batches of increasing length
doc1 = ["India"]
doc2 = ["I love coding"]
doc3 = ["I love python porgamming and Data Science. Machine Learning is amazing."]
data = [doc1,doc2,doc3]

# Print the shape of the sentence-level embedding of each document
for batch in map(tf.constant, data):
  print(embedding_model(batch).shape)

(1, 512)
(1, 512)
(1, 512)


In [10]:
# Print the shape of the token-level embedding of each document
for batch in map(tf.constant, data):
  print(embedding_model2(batch).shape)

(1, 128, 512)
(1, 128, 512)
(1, 128, 512)


# Text Classification

In [9]:
# Text Classification

In [11]:
# Download the test and training files used below (-q silences wget's progress output)
!wget -q https://www.dropbox.com/s/p1z32pkhs9j7cv3/test_data.txt
!wget -q https://www.dropbox.com/s/ed3qcu1231e8ubx/train_5500.txt

In [12]:
# Read both splits as lists of raw lines.
# The with-blocks close each file handle as soon as its lines have been read.
with open("train_5500.txt") as train_file:
  train_data = train_file.readlines()
with open("test_data.txt") as test_file:
  test_data = test_file.readlines()

In [13]:
# Number of raw lines read from the training file
len(train_data)

5452

In [14]:
# Number of raw lines read from the test file
len(test_data)

500

In [16]:
# Peek at the first five raw lines to see the "LABEL:sublabel question" layout
train_data[:5]

['DESC:manner How did serfdom develop in and then leave Russia ?\n',
 'ENTY:cremat What films featured the character Popeye Doyle ?\n',
 "DESC:manner How can I find a list of celebrities ' real names ?\n",
 'ENTY:animal What fowl grabs the spotlight after the Chinese Year of the Monkey ?\n',
 'ABBR:exp What is the full form of .com ?\n']

In [17]:
def prepare_data(data):
  """Split raw labelled lines into question texts and coarse class labels.

  Each line is split at its first space: the leading token carries the label
  (only the part before the first ":" is kept) and the rest of the line,
  stripped of surrounding whitespace, is the question text.
  Blank or whitespace-only lines are skipped.

  Args:
    data: iterable of raw text lines, e.g. "DESC:manner How ... ?\\n".

  Returns:
    A tuple (x, y) of two equally long lists: x holds the question strings
    and y the matching coarse labels.
  """
  x = []
  y = []
  for doc in data:
    # A blank line (e.g. a trailing newline at the end of the file) carries no
    # label; keeping it would add a bogus "\n" class to the label set.
    if not doc.strip():
      continue
    tag, _, question = doc.partition(" ")
    y.append(tag.split(":")[0])
    x.append(question.strip())
  return x,y


In [18]:
# Split each set of raw lines into question texts (x*) and coarse labels (y*)
xtrain,ytrain = prepare_data(train_data)
xtest,ytest = prepare_data(test_data)

In [19]:
import pandas as pd

In [20]:
# One-hot encode the training labels: one indicator column per class
ytrain = pd.get_dummies(pd.DataFrame(ytrain))
ytrain.head()

Unnamed: 0,0_ABBR,0_DESC,0_ENTY,0_HUM,0_LOC,0_NUM
0,0,1,0,0,0,0
1,0,0,1,0,0,0
2,0,1,0,0,0,0
3,0,0,1,0,0,0
4,1,0,0,0,0,0


In [21]:
# One-hot encode the test labels: one indicator column per class
ytest = pd.get_dummies(pd.DataFrame(ytest))
# Encoding the two splits independently only lines up when both contain exactly
# the same classes; force the training column set and order so every column
# means the same class in both frames (classes absent from the test set get 0).
ytest = ytest.reindex(columns=ytrain.columns, fill_value=0)
ytest.head()

Unnamed: 0,0_ABBR,0_DESC,0_ENTY,0_HUM,0_LOC,0_NUM
0,0,0,0,0,0,1
1,0,0,0,0,1,0
2,0,0,0,1,0,0
3,0,1,0,0,0,0
4,0,0,0,0,0,1


In [22]:
# Class names in column order: drop the two-character "0_" prefix from each one-hot column name
categories = [column_name[2:] for column_name in ytrain.columns]
categories

['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM']

### Text Classification using Transfer Learning

In [23]:
# create a function which will accept a text doc and return the corresponding vector
def sent2vec(doc):
  """Embed a batch of documents with the sentence-level BERT model.

  `doc` is cast to a string tensor and its axis 1 is squeezed away before the
  result is passed to `embedding_model`, whose output is returned as-is.
  """
  flat_strings = tf.squeeze(tf.cast(doc, tf.string), axis=1)
  return embedding_model(flat_strings)

In [24]:
from tensorflow.keras import models,layers

In [None]:
# Classifier on top of the BERT sentence embeddings.
# Each sample is a single string, hence the (1,) input shape.
input_layer = layers.Input(shape=(1,),dtype='string')
# sent2vec runs the sentence encoder; the Lambda wrapper declares a 512-dim output.
# NOTE(review): the model summary reports 0 parameters for this Lambda layer, so only
# the two Dense layers appear to be trained even though the encoder was loaded with
# trainable=True — confirm that frozen BERT features are what is intended.
embedding_layer = layers.Lambda(sent2vec,output_shape=(512,))(input_layer)
dense1 = layers.Dense(256,activation='relu')(embedding_layer)
# Six softmax units, one per entry of `categories`
output_layer = layers.Dense(6,activation='softmax')(dense1)

# categorical_crossentropy matches the one-hot encoded label frames
model = models.Model(inputs=[input_layer],outputs=output_layer)
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [26]:
import numpy as np

# The classifier takes one string per row, so turn each list of questions into an (n, 1) array
xtrain = np.reshape(np.array(xtrain), (-1, 1))
xtest = np.reshape(np.array(xtest), (-1, 1))
for split in (xtrain, xtest):
  print(split.shape)

(5452, 1)
(500, 1)


In [27]:
# Print the layer stack with output shapes and parameter counts
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 lambda (Lambda)             (None, 512)               0         
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dense_1 (Dense)             (None, 6)                 1542      
                                                                 
Total params: 132870 (519.02 KB)
Trainable params: 132870 (519.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for two epochs in batches of 64; the test split doubles as validation data
model.fit(xtrain,ytrain,epochs=2,batch_size=64,validation_data=(xtest,ytest))

Epoch 1/2
Epoch 2/2
14/86 [===>..........................] - ETA: 18:35 - loss: 1.0105 - accuracy: 0.6295

In [None]:
# Two unseen questions, arranged as an (n, 1) array like the training inputs
new_doc = np.array([
    "Who is president of United States of America?",
    "What is price for Harley Davidson?",
]).reshape(-1,1)
output = model.predict(new_doc)

In [None]:
# Map every row of class scores to the name of its highest-scoring category
output2 = [categories[class_index] for class_index in np.argmax(output, axis=1)]
output2