<a href="https://colab.research.google.com/github/akhilkapil/Text-Classification-101/blob/main/Text_Classification_04_GloveEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook we demonstrate a text classification model that uses GloVe embeddings with a neural network.

In [None]:
import pandas as pd 
import numpy as np 
import sys 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model, Sequential
from keras.initializers import Constant 
import zipfile
import os 
import tensorflow as tf

import regex as re 
from gensim.parsing.preprocessing import remove_stopwords

In [None]:
! wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# Unpack every member of the downloaded GloVe archive into /content/Glove.
with zipfile.ZipFile(file='/content/glove.6B.zip', mode='r') as glove_archive:
    glove_archive.extractall(path='/content/Glove')

In [None]:
# Read the labelled training tweets and the unlabelled test tweets.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train_2kmZucJ.csv', '/content/test_oJQbWVk.csv')
)

In [None]:
# Characters that clean_text() replaces with a single space. Besides ordinary
# punctuation this includes symbols, box-drawing glyphs and a few accented
# letters.
puncts = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
    '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
    '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
    '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√',
]


def clean_text(text):
    """Return ``str(text)`` with every entry of ``puncts`` replaced by a space.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Each matched character becomes exactly one space; runs of
    spaces are not collapsed here.
    """
    cleaned = str(text)
    for symbol in puncts:
        # str.replace leaves the string unchanged when the symbol is absent.
        cleaned = cleaned.replace(symbol, ' ')
    return cleaned

def remove_emoji(text):
    """Delete every character of ``text`` that falls in the emoji ranges below.

    Consecutive matching characters are removed as one run and nothing is
    inserted in their place.

    Note: the last range runs from U+24C2 to U+1F251, so it also removes many
    non-emoji characters in that span (CJK ideographs, for example).
    """
    codepoint_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(codepoint_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)
    
train_df['tweet'] = train_df['tweet'].apply(lambda x: remove_emoji(x)) 
train_df['tweet'] = train_df['tweet'].apply(lambda x: clean_text(x)) 
train_df['tweet'] = train_df['tweet'].apply(lambda x: re.sub(r'http\S+','',x))
train_df['tweet'] = train_df['tweet'].apply(lambda x: re.sub("@[\w]*", '', x))
train_df['tweet'] = train_df['tweet'].apply(lambda x:' '.join(x.split()))
train_df['tweet'] = train_df['tweet'].apply(lambda x: remove_stopwords(x))
train_df['tweet'] = train_df['tweet'].apply(lambda x: x.lower())



In [None]:
#Preprocessing the test dataset as well
test_df['tweet'] = test_df['tweet'].apply(lambda x: remove_emoji(x)) 
test_df['tweet'] = test_df['tweet'].apply(lambda x: clean_text(x)) 
test_df['tweet'] = test_df['tweet'].apply(lambda x: re.sub(r'http\S+','',x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: re.sub("@[\w]*", '', x))
test_df['tweet'] = test_df['tweet'].apply(lambda x:' '.join(x.split()))
test_df['tweet'] = test_df['tweet'].apply(lambda x: remove_stopwords(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: x.lower())


In [None]:
# --- Paths ---
GLOVE_DIR = '/content/Glove'      # directory holding the extracted GloVe files

# --- Text vectorisation ---
MAX_NUM_WORDS = 20_000            # vocabulary cap given to the Keras Tokenizer
MAX_LENGTH_SEQ = 1_000            # length every sequence is padded/truncated to
EMBEDDING_DIM = 100               # width of the glove.6B.100d vectors used below

# --- Training ---
VALIDATION_SPLIT = 0.2            # fraction of the training data held out for validation
LABELS_LEN = train_df['label'].nunique()   # number of distinct classes in the label column

In [None]:
# Feature column (cleaned tweet text) and target column (class label).
X, y = train_df['tweet'], train_df['label']

## Loading and Preprocessing 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets with a fixed seed.
# NOTE(review): y_train is reassigned by the validation-split cell further
# down, and the models below are fit on x_train/x_val, not on these frames.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2018,
)

In [None]:
# Turn each tweet into a list of integer word ids with the Keras Tokenizer.
# The vocabulary is learned from the training tweets only; the same fitted
# tokenizer then encodes both the training and the test tweets.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df['tweet'])
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['tweet'])
    for frame in (train_df, test_df)
)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 23144 unique tokens.


In [None]:
# Bring every id sequence to exactly MAX_LENGTH_SEQ entries (zero-padded) so
# the batches fed to the network are rectangular, and one-hot encode the labels.
train_data, test_data = (
    pad_sequences(sequences, maxlen=MAX_LENGTH_SEQ)
    for sequences in (train_sequences, test_sequences)
)
label_ids = np.asarray(train_df['label'])
train_labels = to_categorical(label_ids)

In [None]:
# Split the training data into a training set and a validation set.
# A fixed seed makes the shuffle, and therefore the split, reproducible.
SPLIT_SEED = 2018
indices = np.random.RandomState(SPLIT_SEED).permutation(train_data.shape[0])
train_data = train_data[indices]
train_labels = train_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * train_data.shape[0])
# Slice at a non-negative boundary. With the negative-index form, a
# validation size of 0 would make data[:-0] empty and data[-0:] the whole
# array, i.e. the training set would silently vanish.
split_at = train_data.shape[0] - num_validation_samples
x_train = train_data[:split_at]
y_train = train_labels[:split_at]
x_val = train_data[split_at:]
y_val = train_labels[split_at:]

In [None]:
print('Preparing embedding matrix')

# Build a lookup from each word in the GloVe file to its embedding vector.
# Every line of the file has the form "<word> <v1> <v2> ... <vN>".
embedding_index = {}
# Open with an explicit encoding so the read does not depend on the
# platform's default text encoding.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print('Found %s word vectors in Glove embeddings.' % len(embedding_index))
# Sanity check: show the vector of one known word.
print(embedding_index["google"])

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Words that GloVe does not contain keep an all-zero row.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
print(embedding_matrix.shape)
for word, i in word_index.items():
    # Indices above the vocabulary cap have no row in the matrix.
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Wrap the pre-trained matrix in a Keras Embedding layer. trainable=False
# freezes the weights so training does not modify the GloVe vectors.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LENGTH_SEQ,
    trainable=False,
)
print('Preparing of embedding matrix is done')

Preparing of embedding matrix is done



**1D CNN Model with pre-trained embedding**

In [None]:
# Three Conv1D blocks over the frozen GloVe embeddings, followed by a dense
# classifier with one softmax unit per class.
cnnmodel = Sequential([
    embedding_layer,
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(LABELS_LEN, activation='softmax'),
])

cnnmodel.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
cnnmodel.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=128,
)

**LSTM model that trains its own embedding**

In [None]:
# LSTM classifier whose embedding layer is learned from scratch.
rnnmodel = Sequential()
rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))
rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# The labels are one-hot encoded (to_categorical), i.e. the classes are
# mutually exclusive. Use a softmax head sized by LABELS_LEN with
# categorical cross-entropy, as the CNN model does. Independent sigmoid units
# with binary cross-entropy do not produce a class distribution, and the
# 'accuracy' metric is then computed per output element, not per sample.
rnnmodel.add(Dense(LABELS_LEN, activation='softmax'))
rnnmodel.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print('Training the RNN')

rnnmodel.fit(x_train, y_train,
          batch_size=32,
          epochs=1,
          validation_data=(x_val, y_val))

Training the RNN


<tensorflow.python.keras.callbacks.History at 0x7f8660376668>

**Bidirectional LSTM with 3 stacked recurrent layers**

In [None]:
# Three stacked bidirectional LSTM layers over the frozen GloVe embeddings.
# The first two return full sequences so the next recurrent layer receives
# one vector per time step; the last returns only its final state.
model = Sequential()
model.add(embedding_layer)
model.add(tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu', kernel_initializer='uniform'))
# One-hot labels with mutually exclusive classes: use a softmax head sized by
# LABELS_LEN and categorical cross-entropy, consistent with the CNN model,
# instead of independent sigmoid units with binary cross-entropy.
model.add(Dense(LABELS_LEN, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=5, validation_data= (x_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8524c76588>