In [None]:
import os

# Ask CUDA to enumerate GPUs in PCI bus order so that device indices are stable.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
# Expose only the GPU with index 1 to this process; this is set before the
# TensorFlow import in the next cell.
# NOTE(review): the recorded output below reports 0 GPUs; on a single-GPU host
# index 1 would not exist, which may be the cause - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
import tensorflow as tf
# Sanity check: report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [None]:
import os
import json
import sys
import random
import itertools
from collections import Counter

import pickle
import numpy as np
import pandas as pd

from spacy.lang.en.stop_words import STOP_WORDS as spacy_stop_words

from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, Flatten, Reshape
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dropout, Activation, concatenate, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers, layers, optimizers, losses, metrics
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn import model_selection
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
from IPython.display import SVG 
import matplotlib.pyplot as plt
import seaborn as sns

# Use the 'fivethirtyeight' matplotlib style for every figure in the notebook.
plt.style.use('fivethirtyeight')
# Render matplotlib figures inline, below the cell that produced them.
%matplotlib inline

In [None]:
# Show up to 120 characters per cell when pandas displays a frame (sentences are long).
pd.options.display.max_colwidth = 120

In [None]:

def tokenize(sentences, VOCAB_SIZE, filters='!"#%&()*,+-/:;<=>?@[\\]^_`{|}~\t\n', 
             lower=True, split=' ', char_level=False, oov_token="<unk>", verbose=True):
    """Fit a Keras ``Tokenizer`` on ``sentences`` and encode them.

    Args:
        sentences: iterable of strings to fit on and to encode.
        VOCAB_SIZE: passed to the tokenizer as ``num_words``; ``0`` means
            "no pruning" and is translated to ``None``.
        filters, lower, split, char_level, oov_token: forwarded unchanged to
            ``keras.preprocessing.text.Tokenizer``. The default ``filters``
            does not list the apostrophe, so it is not stripped from words.
        verbose: when true, print the tokenizer settings and the sizes of its
            internal dictionaries.

    Returns:
        ``(tokenizer, idx, tok)`` where ``idx`` is the output of
        ``texts_to_sequences`` and ``tok`` is ``idx`` decoded back to text with
        ``sequences_to_texts``.

    Notes (Keras tokenizer behaviour, carried over from the original notes):
        index 0 is reserved and never assigned to a word; when ``oov_token`` is
        given it gets index 1. The tokenizer dictionaries (``word_index``,
        ``word_counts``, ``word_docs``) hold every word seen, even though the
        encoded sequences only use the first ``VOCAB_SIZE`` indices; anything
        beyond that is encoded as the OOV token.
    """
    # Keras expects None (not 0) to disable vocabulary pruning.
    num_words = VOCAB_SIZE if VOCAB_SIZE != 0 else None

    tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        filters=filters,
        lower=lower,
        split=split,
        char_level=char_level,
        oov_token=oov_token,
    )
    tokenizer.fit_on_texts(sentences)
    idx = tokenizer.texts_to_sequences(sentences)
    tok = tokenizer.sequences_to_texts(idx)

    if not verbose:
        return tokenizer, idx, tok

    print("tokenizer details:")

    # Scalar settings are printed as-is ...
    setting_names = ('char_level', 'filters', 'lower', 'split', 'oov_token',
                     'document_count', 'num_words')
    # ... while the (potentially huge) dictionaries are only summarised by size.
    dict_names = ('index_docs', 'word_docs', 'index_word', 'word_index', 'word_counts')

    for name in setting_names:
        print(f"\t{name:<20}: {getattr(tokenizer, name)}")
    print("\n")

    for name in dict_names:
        print(f"\tlen {name:<20}: {len(getattr(tokenizer, name))}")
    print("\n")

    return tokenizer, idx, tok


def build_word_index(tokenizer, VOCAB_SIZE, pad_idx=0, verbose=True):
    """Register the special tokens on ``tokenizer`` and build pruned lookups.

    Args:
        tokenizer: object exposing mutable ``word_index`` (word -> index) and
            ``index_word`` (index -> word) dicts; both are updated in place.
        VOCAB_SIZE: keep only words whose index is ``< VOCAB_SIZE``. ``0`` or
            ``None`` means "no pruning", the same convention ``tokenize`` uses.
        pad_idx: index assigned to the ``<pad>`` token.
        verbose: when true, print the sizes of the resulting lookups.

    Returns:
        ``(tokenizer, word2idx, idx2word)``.

    Note:
        ``<marker>`` is registered at index 1. If the tokenizer already holds
        another word at index 1 (e.g. its OOV token), ``word2idx`` keeps both
        words mapped to 1 while ``idx2word[1]`` becomes ``'<marker>'``, so
        ``word2idx`` can be one entry longer than ``idx2word``.
    """
    # <pad> must be registered so that sequences_to_texts on padded data decodes
    # pad_idx as <pad> instead of falling back to the OOV token.
    tokenizer.word_index.update({'<pad>': pad_idx})
    tokenizer.index_word.update({pad_idx: '<pad>'})
    tokenizer.word_index.update({'<marker>': 1})
    tokenizer.index_word.update({1: '<marker>'})

    if not VOCAB_SIZE:
        # 0 / None: no pruning requested, keep the full vocabulary.
        word2idx = dict(tokenizer.word_index)
    else:
        # tokenizer.word_index holds every word seen; keep what is under the cap.
        word2idx = {w: idx for w, idx in tokenizer.word_index.items() if idx < VOCAB_SIZE}
    idx2word = {idx: w for w, idx in word2idx.items()}

    # Effective vocabulary size (only reported; the caller's value is untouched).
    VOCAB_SIZE = len(word2idx)

    if verbose:
        print("len word2idx: ", len(word2idx))
        print("len idx2word: ", len(idx2word))
        print("vocab size: ", VOCAB_SIZE)
        print("")

    return tokenizer, word2idx, idx2word
    

def built_embed_matrix(embed, word2idx, VOCAB_SIZE, EMBED_DIM, oov_vec = "mean"):
    """Build an embedding matrix from pre-trained word vectors.

    Args:
        embed: mapping word -> 1-D vector of length ``EMBED_DIM``.
        word2idx: mapping word -> row index; every index must be ``< VOCAB_SIZE``.
        VOCAB_SIZE: number of rows of the returned matrix.
        EMBED_DIM: number of columns of the returned matrix.
        oov_vec: how to fill rows of words missing from ``embed``:
            ``"mean"``: the average of all pre-trained vectors;
            ``"rand"`` (alias ``"norm"``): a fresh normal sample per word, using
            the mean and std of all values in the pre-trained vectors.

    Returns:
        ``(embedding_matrix, unk_words)`` where ``unk_words`` is a list of
        ``(word, idx)`` for words not found in ``embed``. ``<pad>`` is never
        reported as unknown and its row stays all zeros.

    Raises:
        ValueError: if ``oov_vec`` is not one of the supported strategies
            (previously such a value silently left OOV rows at zero).
    """
    # The documentation used to call the random strategy "norm" while the code
    # only recognised "rand"; accept both spellings.
    if oov_vec == "norm":
        oov_vec = "rand"
    if oov_vec not in ("mean", "rand"):
        raise ValueError(f"oov_vec must be 'mean', 'rand' or 'norm', got {oov_vec!r}")

    # stack all pre-trained word embeddings
    emb_all = np.stack(list(embed.values()))

    if oov_vec == "mean":
        # average embedding of all words
        emb_oov = np.mean(emb_all, axis=0)
        print(f"embed_all shape: {emb_all.shape} | embed_mean_vec shape: {emb_oov.shape}")
    else:
        # global mean / std of every value in the pre-trained embeddings
        emb_mean, emb_std = emb_all.mean(), emb_all.std()
        # sample vector, drawn only so its shape can be reported below
        emb_oov = np.random.normal(emb_mean, emb_std, (EMBED_DIM))

        print(f"embed_all shape: {emb_all.shape}")
        print(f"embed mean: {emb_mean} | embed std: {emb_std} | embed rand_vec shape: {emb_oov.shape}")

    # prepare embedding matrix
    print('\nprepare embedding matrix...')

    embedding_matrix = np.zeros((VOCAB_SIZE, EMBED_DIM))
    embed_cnt = 0
    unk_words = []

    for word, idx in word2idx.items():
        embedding_vector = embed.get(word)

        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
            embed_cnt += 1
            continue

        # embedding for <pad> token stays 0's as initialized
        if word == '<pad>':
            continue

        if oov_vec == "rand":
            # every OOV word gets its own random vector
            emb_oov = np.random.normal(emb_mean, emb_std, (EMBED_DIM))
        embedding_matrix[idx] = emb_oov
        unk_words.append((word, idx))

    print(f"\tembedding matrix shape: {embedding_matrix.shape}")
    print(f"\tword embedding found: {embed_cnt}")
    print(f"\tword embedding not found: {len(unk_words)}")
    print("")

    return embedding_matrix, unk_words

    

In [None]:
from google.colab import drive
# Mount Google Drive (Colab only) so the data files are reachable under /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read the raw training file as a list of lines (each keeps its trailing newline).
# The context manager guarantees the file handle is closed once the lines are read.
with open("/content/gdrive/My Drive/train.csv") as data:
    sents = data.readlines()

In [None]:
# Keep every line except those that consist solely of a newline character.
fin = [line for line in sents if line != '\n']
len(fin)

40

In [None]:
# One label per retained line; derived from len(fin) instead of a hard-coded 40
# so it stays in sync with the data.
# NOTE(review): every label is 1, so the data set contains a single class - confirm
# this is intended for a binary classifier.
out = [1] * len(fin)

In [None]:
# Whitespace-token count of every retained line.
lens = [len(line.split()) for line in fin]

In [None]:
# Report selected quantiles of the per-line token counts.
for q in (.5, .6, .8, .9, .95, 1.0):
    q_value = np.quantile(lens, q)
    print(f"\tquantile: {q:<5} | {q_value}")

	quantile: 0.5   | 42.5
	quantile: 0.6   | 45.0
	quantile: 0.8   | 53.20000000000002
	quantile: 0.9   | 68.80000000000001
	quantile: 0.95  | 79.94999999999995
	quantile: 1.0   | 106.0


In [None]:
# Sum of the per-line distinct token counts (a token that occurs in several
# lines is counted once for each of those lines).
c = sum(len(set(line.split())) for line in fin)
c

In [None]:
# Upper bound on the vocabulary used for encoding.
VOCAB_SIZE = 1500
# Fit the tokenizer on the retained lines; `idx` holds the integer sequences and
# `tok` the same sequences decoded back to text.
tokenizer, idx, tok = tokenize(fin, VOCAB_SIZE)

tokenizer details:
	char_level          : False
	filters             : !"#%&()*,+-/:;<=>?@[\]^_`{|}~	

	lower               : True
	split               :  
	oov_token           : <unk>
	document_count      : 40
	num_words           : 1500


	len index_docs          : 451
	len word_docs           : 451
	len index_word          : 452
	len word_index          : 452
	len word_counts         : 451




In [None]:
# Inspect one decoded line (index 10) to see the text after tokenization.
tok[10]

'for pos tagging and lemmatization we combine genia with its built in occasionally deviant to kenizer and tnt brants 2000 which operates on pre tokenized inputs but in its default models trained on financial news from the penn tree bank. $$$$$ tnt uses second order markov models for part ofspeech tagging.'

In [None]:
# Register <pad> (index 0) and <marker> (index 1) on the tokenizer and build the
# word <-> index lookups restricted to indices below VOCAB_SIZE.
tokenizer, word2idx, idx2word = build_word_index(tokenizer, VOCAB_SIZE, pad_idx=0)

len word2idx:  454
len idx2word:  453
vocab size:  454



In [None]:
def convert2seq(texts=None, vocab=None):
  """Encode whitespace-separated texts as lists of vocabulary indices.

  Args:
    texts: iterable of strings; defaults to the notebook-level ``tok``.
    vocab: mapping word -> index; defaults to the notebook-level ``word2idx``.

  Returns:
    A list with one list of indices per text.

  Raises:
    KeyError: if a word is missing from ``vocab``.
  """
  # Fall back to the notebook globals so existing no-argument calls keep working.
  if texts is None:
    texts = tok
  if vocab is None:
    vocab = word2idx
  return [[vocab[word] for word in text.split()] for text in texts]

In [None]:
# Encode every decoded line in `tok` as a list of indices from `word2idx`.
data_idx = convert2seq()

In [None]:
# Look up the index assigned to the '$$$$$' token.
# NOTE(review): it appears to separate two sentences within a line - confirm.
word2idx['$$$$$']

5

In [None]:
# Fixed model input length; longer sequences are cut and shorter ones padded.
MAX_SEQ_LEN = 90
# Pad / truncate at the end of each sequence, using 0 (the <pad> index) as filler.
data_padded = keras.preprocessing.sequence.pad_sequences(data_idx, maxlen=MAX_SEQ_LEN, 
                                                         padding='post', truncating='post', value=0)
data_padded.shape

(40, 90)

In [None]:
# Dimensionality of the pre-trained GloVe vectors; also selects the file to load.
EMBED_DIM = 50
embed_path = "/content/gdrive/My Drive/glove.6B/glove.6B.%sd.txt" % EMBED_DIM 
glove_embed = {}
# Context manager closes the file even if parsing fails; GloVe files are UTF-8 text.
with open(embed_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        # Keep only well-formed rows; compare against EMBED_DIM rather than a
        # literal 50 so that changing the dimension above keeps working.
        if len(coefs) == EMBED_DIM:
            glove_embed[word] = coefs

In [None]:
# Build the (VOCAB_SIZE, EMBED_DIM) embedding matrix from GloVe; words without a
# pre-trained vector get the mean GloVe vector and are listed in `unk_words`.
embedding_matrix, unk_words = built_embed_matrix(glove_embed, word2idx, VOCAB_SIZE, 
                                                 EMBED_DIM, oov_vec='mean')

embed_all shape: (400000, 50) | embed_mean_vec shape: (50,)

prepare embedding matrix...
	embedding matrix shape: (1500, 50)
	word embedding found: 407
	word embedding not found: 46



In [None]:
# Shuffle and hold out 20% of the padded sequences for testing; random_state fixes the split.
train_x, test_x, train_y, test_y = model_selection.train_test_split(np.array(data_padded), np.array(out), random_state=10, test_size=0.2, shuffle=True)
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((32, 90), (32,), (8, 90), (8,))

In [None]:
# Binary classifier: frozen GloVe embeddings -> BiLSTM -> dense -> sigmoid.
NUM_CLASSES = 1
input_ = Input(shape=(MAX_SEQ_LEN,), name="input")

# Frozen pre-trained embeddings; mask_zero makes downstream layers skip <pad> (index 0).
embed_layer = Embedding(input_dim= VOCAB_SIZE, weights=[embedding_matrix], 
                        output_dim= EMBED_DIM, input_length = MAX_SEQ_LEN, 
                        trainable= False, mask_zero=True, name="embed")(input_)

# The classifier needs a single vector per sentence, so the BiLSTM returns only
# its final output, shape (None, 256) as in the recorded summary. With
# return_sequences / return_state enabled the layer returns a list of tensors,
# which cannot be fed to the Dense layer below.
lstm_layer = Bidirectional(LSTM(units= 128, dropout=.25, 
                                recurrent_dropout=.25, kernel_regularizer = regularizers.l2(0.01),
                               ), name="lstm")(embed_layer)

dense_layer = Dense(64, activation="relu", name="dense")(lstm_layer)
dense_layer = Dropout(.25)(dense_layer)
# Named `output_` (not `out`) so the label list `out` is not overwritten and the
# train/test split cell can be re-run.
output_ = Dense(NUM_CLASSES, activation= "sigmoid", name="output")(dense_layer) 

model_lstm = Model(input_, output_)

model_lstm.compile(optimizer= "adam", 
                   loss= "binary_crossentropy", 
                   metrics= ["accuracy"]) 

model_lstm.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 90)]              0         
_________________________________________________________________
embed (Embedding)            (None, 90, 50)            75000     
_________________________________________________________________
lstm (Bidirectional)         (None, 256)               183296    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
output (Dense)               (None, 1)                 65        
Total params: 274,809
Trainable params: 199,809
Non-trainable params: 75,000
___________________________________________

In [None]:
# Balanced class weights keyed by the actual class labels present in train_y.
# Keyword arguments are required by current scikit-learn releases, and zipping
# labels with weights avoids assuming that exactly two classes (0 and 1) exist.
class_labels = np.unique(train_y)
class_weight_values = compute_class_weight(class_weight='balanced', classes=class_labels, y=train_y)
class_weights = {int(label): float(weight)
                 for label, weight in zip(class_labels, class_weight_values)}

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 64
EPOCHS = 20

# Train with per-class loss weights; the held-out test split is also used as
# the validation set reported after every epoch.
history = model_lstm.fit(train_x, train_y,
                         batch_size = BATCH_SIZE,
                         epochs = EPOCHS,
                         shuffle = True,
                         class_weight = class_weights,
                         validation_data = (test_x, test_y)
                        )

In [None]:
# Score the held-out split and binarise the probabilities at a 0.70 threshold.
pred = model_lstm.predict(test_x)
pred = (pred>=.70).astype(float)

# Number of test samples predicted as the positive class.
count = sum(1 for row in pred if row == 1)

print(count)
print('\nClassification Report:')
print(classification_report(test_y, pred))

In [None]:
# Persist the trained model (architecture + weights) to an .h5 file in the working directory.
model_lstm.save("model_lstm.h5")

NameError: ignored

In [None]:
# Reload the model saved above. `load_model` is reached through the imported
# `keras` module (the bare name was never imported), and the file name matches
# the one used by `model_lstm.save`.
model_lstm = keras.models.load_model("model_lstm.h5")

In [None]:
# Fetch the BiLSTM layer by its explicit name instead of matching on str(layer),
# and read it from `model_lstm` (no variable called `model` exists).
weightLSTM = model_lstm.get_layer("lstm").get_weights()
# A Bidirectional LSTM carries six arrays: kernel, recurrent kernel and bias for
# each direction. The first three are taken here.
# NOTE(review): presumably the forward direction comes first - confirm.
warr, uarr, barr = weightLSTM[:3]
warr.shape,uarr.shape,barr.shape

In [None]:
# Encoder that mirrors the classifier's embedding + BiLSTM layers (same layer
# names, so weights can be copied by name) but exposes the LSTM states.
input_ = Input(shape=(MAX_SEQ_LEN,), name="input")
embed_layer = Embedding(input_dim= VOCAB_SIZE, weights=[embedding_matrix], 
                        output_dim= EMBED_DIM, input_length = MAX_SEQ_LEN, 
                        trainable= False, mask_zero=True, name="embed")(input_)
# With return_state=True a Bidirectional LSTM returns five tensors: the output
# sequence plus (h, c) for the forward and for the backward direction.
seq_out, fwd_h, fwd_c, bwd_h, bwd_c = Bidirectional(LSTM(units= 128, dropout=.25, 
                                recurrent_dropout=.25, kernel_regularizer = regularizers.l2(0.01), return_sequences=True, return_state=True,
                               ), name="lstm")(embed_layer)
# Join both directions into one hidden-state and one cell-state vector per sentence.
h = Concatenate(name="state_h")([fwd_h, bwd_h])
c = Concatenate(name="state_c")([fwd_c, bwd_c])
model1 = Model(input_, [h, c, seq_out])

In [None]:
# Copy trained weights from the classifier into the encoder, matching layers by name.
trained_layers = {trained.name: trained for trained in model_lstm.layers}
for layer in model1.layers:
  source_layer = trained_layers.get(layer.name)
  if source_layer is not None:
    layer.set_weights(source_layer.get_weights())

In [None]:
from scipy.spatial import distance

# NOTE(review): `col1` is treated as the query sentence and `col2` as a candidate
# for it, based on how the two columns are used below - confirm against the file.
test1 = pd.read_csv("/content/gdrive/My Drive/test1.csv")
# Rows with a missing sentence cannot be encoded, so they are dropped.
df = test1.dropna(subset=['col1', 'col2'])

# Every distinct sentence from either column is encoded exactly once.
flist = list(set(df['col1']) | set(df['col2']))

# The model consumes padded index sequences, not raw strings, so the sentences go
# through the same tokenizer / padding settings as the training data.
flist_idx = tokenizer.texts_to_sequences(flist)
flist_padded = keras.preprocessing.sequence.pad_sequences(flist_idx, maxlen=MAX_SEQ_LEN,
                                                          padding='post', truncating='post', value=0)
h_t_keras, c_t_keras, lstm = model1.predict(flist_padded)

# sentence -> 1-D final hidden state (distance.cosine requires 1-D inputs)
sentences = {sent: h_t_keras[i] for i, sent in enumerate(flist)}

# For each query, rank its candidates by cosine distance (smaller = more similar)
# and show the three closest ones.
for sent in df['col1'].unique():
  inp = sentences[sent]
  opts = list(df.loc[df['col1'] == sent, 'col2'])

  dict1 = {opt: distance.cosine(inp, sentences[opt]) for opt in opts}
  dict1 = {k: v for k, v in sorted(dict1.items(), key=lambda item: item[1])}
  # dict views are not sliceable, so materialise the keys first
  print(f"{sent!r}: {list(dict1.keys())[:3]}")