In [None]:
import os

# Configure CUDA device selection (this cell runs before the cell that imports
# TensorFlow): enumerate GPUs in PCI bus order and expose only device index 1.
# NOTE(review): the following cell reports 0 visible GPUs; on a host with a
# single GPU only index 0 exists, so "1" would hide it - confirm the index.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [None]:
import tensorflow as tf
# Print how many physical GPU devices TensorFlow can list in this session.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [None]:
import os
import json
import sys
import random
import itertools
from collections import Counter

import pickle
import numpy as np
import pandas as pd

from spacy.lang.en.stop_words import STOP_WORDS as spacy_stop_words

from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, Flatten, Reshape
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dropout, Activation, concatenate, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers, layers, optimizers, losses, metrics
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn import model_selection
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
from IPython.display import SVG 
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the "fivethirtyeight" matplotlib style sheet and render figures inline.
plt.style.use('fivethirtyeight')
%matplotlib inline

In [None]:
# Let pandas display up to 120 characters per column before truncating.
pd.set_option("display.max_colwidth", 120)

In [None]:

def tokenize(sentences, VOCAB_SIZE, filters='!"#%&()*,+-/:;<=>?@[\\]^_`{|}~\t\n', 
             lower=True, split=' ', char_level=False, oov_token="<unk>", verbose=True):
    """Fit a Keras ``Tokenizer`` on ``sentences`` and encode them.

    The default ``filters`` string does not contain the apostrophe, so tokens
    may include the `'` character.

    Index 0 is reserved and never assigned to a word. When ``oov_token`` is
    given it receives index 1 and is added to ``word_index`` / ``index_word``.

    The encoded sequences only use the ``VOCAB_SIZE`` most frequent words, but
    the tokenizer's dictionaries (``word_index``, ``word_counts``,
    ``word_docs``) still hold every word seen; words whose index exceeds the
    limit are encoded as the OOV token.

    Parameters
    ----------
    sentences : iterable of str
        Texts to fit on and to encode.
    VOCAB_SIZE : int
        Passed as ``num_words``; ``0`` means "no pruning" (``num_words=None``).
    filters, lower, split, char_level, oov_token
        Forwarded unchanged to ``keras.preprocessing.text.Tokenizer``.
    verbose : bool
        When true, print the tokenizer configuration and dictionary sizes.

    Returns
    -------
    tuple
        ``(tokenizer, idx, tok)``: the fitted tokenizer, the output of
        ``texts_to_sequences(sentences)``, and those sequences converted back
        to text with ``sequences_to_texts``.
    """
    # 0 is this function's spelling of "keep every word".
    num_words = None if VOCAB_SIZE == 0 else VOCAB_SIZE

    tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        filters=filters,
        lower=lower,
        split=split,
        char_level=char_level,
        oov_token=oov_token,
    )
    tokenizer.fit_on_texts(sentences)
    idx = tokenizer.texts_to_sequences(sentences)
    tok = tokenizer.sequences_to_texts(idx)

    if not verbose:
        return tokenizer, idx, tok

    print("tokenizer details:")

    # Attributes printed by value.
    scalar_attrs = ('char_level', 'filters', 'lower', 'split', 'oov_token',
                    'document_count', 'num_words')
    # Attributes that are dictionaries; only their sizes are printed.
    sized_attrs = ('index_docs', 'word_docs', 'index_word', 'word_index', 'word_counts')

    for name in scalar_attrs:
        print(f"\t{name:<20}: {getattr(tokenizer, name)}")
    print("\n")

    for name in sized_attrs:
        print(f"\tlen {name:<20}: {len(getattr(tokenizer, name))}")
    print("\n")

    return tokenizer, idx, tok


def build_word_index(tokenizer, VOCAB_SIZE, pad_idx=0, verbose=True):
    """Add special tokens to the tokenizer and return lookups limited to VOCAB_SIZE.

    The tokenizer is mutated in place: ``'<pad>'`` is registered at ``pad_idx``
    and ``'<marker>'`` at index 1 in both ``word_index`` and ``index_word``.
    Registering ``'<pad>'`` is needed so that ``sequences_to_texts`` on padded
    data renders the pad index as ``<pad>`` instead of ``<unk>``.

    NOTE(review): ``tokenize`` documents that the OOV token also lives at
    index 1, so ``'<marker>'`` shares that index: ``word2idx`` then contains
    both words while ``idx2word[1]`` is ``'<marker>'`` (hence ``idx2word`` can
    be one entry shorter than ``word2idx``). Confirm this collision is intended.

    Parameters
    ----------
    tokenizer : object exposing ``word_index`` and ``index_word`` dicts
        Typically the fitted Keras tokenizer returned by ``tokenize``.
    VOCAB_SIZE : int or None
        Keep only words whose index is strictly below this value. ``0`` or
        ``None`` means "no pruning" (every word is kept), matching the
        convention used by ``tokenize``.
    pad_idx : int
        Index assigned to ``'<pad>'``.
    verbose : bool
        When true, print the sizes of the returned dictionaries.

    Returns
    -------
    tuple
        ``(tokenizer, word2idx, idx2word)``.
    """
    # Register the special tokens on the tokenizer itself (in-place mutation).
    tokenizer.word_index.update({'<pad>': pad_idx})
    tokenizer.index_word.update({pad_idx: '<pad>'})
    tokenizer.word_index.update({'<marker>': 1})
    tokenizer.index_word.update({1: '<marker>'})

    # tokenizer.word_index holds every word seen; keep only those under the cap.
    # A falsy cap (0 / None) means "no cap" - previously this yielded an empty
    # vocabulary (0) or a TypeError (None).
    if VOCAB_SIZE:
        word2idx = {w: idx for w, idx in tokenizer.word_index.items() if idx < VOCAB_SIZE}
    else:
        word2idx = dict(tokenizer.word_index)
    idx2word = {idx: w for w, idx in word2idx.items()}

    # The real vocabulary may be smaller than the requested cap.
    VOCAB_SIZE = len(word2idx)

    if verbose:
        print("len word2idx: ", len(word2idx))
        print("len idx2word: ", len(idx2word))
        print("vocab size: ", VOCAB_SIZE)
        print("")

    return tokenizer, word2idx, idx2word
    

def built_embed_matrix(embed, word2idx, VOCAB_SIZE, EMBED_DIM, oov_vec = "mean"):
    """Build an embedding matrix from pre-trained word vectors.

    Parameters
    ----------
    embed : dict
        Maps word -> 1-D vector. All vectors are stacked, so the dict must be
        non-empty and the vectors must share one length.
    word2idx : dict
        Maps word -> row index in the returned matrix. Every index must be
        smaller than ``VOCAB_SIZE``.
    VOCAB_SIZE : int
        Number of rows of the returned matrix.
    EMBED_DIM : int
        Number of columns of the returned matrix.
    oov_vec : str
        How to fill rows of words missing from ``embed`` (OOV words):
          "mean": the average of all pre-trained vectors (same vector for every OOV word)
          "rand" (also accepted as "norm", the spelling this docstring used to
            advertise): a fresh vector per OOV word drawn from a normal
            distribution with the mean / std of all pre-trained values
          any other value: OOV rows are left as zeros (unchanged legacy behaviour)

    Returns
    -------
    tuple
        ``(embedding_matrix, unk_words)`` where ``embedding_matrix`` has shape
        ``(VOCAB_SIZE, EMBED_DIM)`` and ``unk_words`` lists ``(word, idx)`` for
        every word other than ``'<pad>'`` that was not found in ``embed``.
        The ``'<pad>'`` row stays all zeros unless ``embed`` contains it.
    """
    use_mean = oov_vec == "mean"
    # Previously only "rand" was recognised although "norm" was documented, so
    # passing "norm" silently produced zero vectors.
    use_rand = oov_vec in ("rand", "norm")

    # stack all pre-trained word embeddings
    emb_all = np.stack(list(embed.values()))

    if use_mean:
        # average embedding of all words
        emb_oov = np.mean(emb_all, axis=0)
        print(f"embed_all shape: {emb_all.shape} | embed_mean_vec shape: {emb_oov.shape}")

    if use_rand:
        # global mean / std over every value in the pre-trained embeddings
        emb_mean, emb_std = emb_all.mean(), emb_all.std()
        # This sample is only used for the shape printed below; it is kept so
        # that seeded runs consume the random stream exactly as before.
        emb_oov = np.random.normal(emb_mean, emb_std, (EMBED_DIM))

        print(f"embed_all shape: {emb_all.shape}")
        print(f"embed mean: {emb_mean} | embed std: {emb_std} | embed rand_vec shape: {emb_oov.shape}")

    # prepare embedding matrix
    print('\nprepare embedding matrix...')

    embedding_matrix = np.zeros((VOCAB_SIZE, EMBED_DIM))
    embed_cnt = 0
    unk_words = []

    for word, idx in word2idx.items():
        embedding_vector = embed.get(word)

        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
            embed_cnt += 1
            continue

        # embedding for <pad> token stays at the zeros it was initialised with
        if word == '<pad>':
            continue

        if use_mean:
            embedding_matrix[idx] = emb_oov
        elif use_rand:
            # a new random vector for each OOV word
            emb_oov = np.random.normal(emb_mean, emb_std, (EMBED_DIM))
            embedding_matrix[idx] = emb_oov

        unk_words.append((word, idx))

    print(f"\tembedding matrix shape: {embedding_matrix.shape}")
    print(f"\tword embedding found: {embed_cnt}")
    print(f"\tword embedding not found: {len(unk_words)}")
    print("")

    return embedding_matrix, unk_words

    

In [None]:
# Mount Google Drive at /content/gdrive; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read merged.txt into `fin`, one list element per line (newlines kept).
# The context manager closes the handle; the original left the file open.
with open("/content/gdrive/My Drive/NLP_Proj/merged.txt") as data:
    fin = data.readlines()

In [None]:
from scipy.spatial import distance

# Load the pair table and collect every distinct value found in col1 / col2.
df = pd.read_csv("/content/gdrive/My Drive/NLP_Proj/test.csv")
sentences = {}
list1 = list(df['col1'])
list2 = list(df['col2'])
# De-duplicate through a set; the resulting order of `flist` is arbitrary.
flist = list(set(list1 + list2))

In [None]:
# Append the test.csv sentences (everything except flist[0]) to the corpus.
# NOTE(review): flist is built from a set, so which sentence sits at index 0 is
# arbitrary; and re-running this cell appends the sentences a second time.
# Confirm both the skipped element and the non-idempotence are intended.
fin = fin + flist[1:]

In [None]:
# Label vector: 24661 ones followed by (63436 - 24661) zeros.
# NOTE(review): presumably aligned with the first 63436 lines of `fin`
# (that many tokenized rows are later used for training) - verify.
out = [1] * 24661 + [0] * (63436 - 24661)

In [None]:
# Number of whitespace-separated tokens in every line of `fin`.
lens = [len(sen.split()) for sen in fin]

In [None]:
# Print selected quantiles of the sentence-length distribution (1.0 is the maximum).
for q in [.5, .6, .8, .9, .95, 1.0]:
    print(f"\tquantile: {q:<5} | {np.quantile(lens, q)}")

	quantile: 0.5   | 44.0
	quantile: 0.6   | 49.0
	quantile: 0.8   | 61.0
	quantile: 0.9   | 72.0
	quantile: 0.95  | 82.0
	quantile: 1.0   | 1413.0


In [None]:
# Distinct whitespace-separated tokens in `fin` (no lower-casing or filtering).
c = {word for sent in fin for word in sent.split()}
len(c)

101483

In [None]:
# Fit the tokenizer on the whole corpus with a cap of 1,000,000 words.
# NOTE(review): the printed details show about 52k entries in word_index, so
# this cap never prunes; VOCAB_SIZE is also reused later as the number of rows
# of the embedding matrix and as the Embedding layer's input_dim.
VOCAB_SIZE = 1000000
tokenizer, idx, tok = tokenize(fin, VOCAB_SIZE)

tokenizer details:
	char_level          : False
	filters             : !"#%&()*,+-/:;<=>?@[\]^_`{|}~	

	lower               : True
	split               :  
	oov_token           : <unk>
	document_count      : 83489
	num_words           : 1000000


	len index_docs          : 52003
	len word_docs           : 52003
	len index_word          : 52004
	len word_index          : 52004
	len word_counts         : 52003




In [None]:
# Register <pad>/<marker> on the tokenizer and build the word <-> index lookups.
tokenizer, word2idx, idx2word = build_word_index(tokenizer, VOCAB_SIZE, pad_idx=0)

len word2idx:  52006
len idx2word:  52005
vocab size:  52006



In [None]:
# Everything after the first 63436 tokenized texts goes to `tok1`.
# NOTE(review): must run before the next cell truncates `tok`; running it
# afterwards yields an empty list.
tok1 = tok[63436:]

In [None]:
# Keep only the first 63436 tokenized texts in `tok` (overwrites `tok` in place,
# so the tail is only recoverable from `tok1`).
tok = tok[:63436]

In [None]:
def convert2seq(sentences=None, vocab=None):
    """Convert space-separated token strings into lists of integer ids.

    Parameters
    ----------
    sentences : iterable of str, optional
        Texts to encode. Defaults to the notebook-level ``tok`` so that the
        original zero-argument call keeps working.
    vocab : dict, optional
        Maps word -> index. Defaults to the notebook-level ``word2idx``.

    Returns
    -------
    list of list of int
        One list of ids per input sentence, in input order.

    Raises
    ------
    KeyError
        If a token is not present in ``vocab``.
    """
    # Globals are resolved at call time, as in the original implementation.
    if sentences is None:
        sentences = tok
    if vocab is None:
        vocab = word2idx

    return [[vocab[word] for word in sent.split()] for sent in sentences]

In [None]:
# Integer-encode the sentences currently held in `tok`.
data_idx = convert2seq()

In [None]:
# Pad / truncate every sequence to 80 ids; padding (value 0) and truncation are
# both applied at the end of the sequence.
MAX_SEQ_LEN = 80
data_padded = keras.preprocessing.sequence.pad_sequences(data_idx, maxlen=MAX_SEQ_LEN, 
                                                         padding='post', truncating='post', value=0)
data_padded.shape

(63436, 80)

In [None]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 vector.
EMBED_DIM = 300
embed_path = "/content/gdrive/My Drive/glove.6B/glove.6B.%sd.txt" % EMBED_DIM 
glove_embed = {}
# `with` guarantees the file is closed even if parsing fails; the encoding is
# explicit so the result does not depend on the platform default.
with open(embed_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # blank line: nothing to parse (previously an IndexError)
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        # keep only rows that have exactly EMBED_DIM coefficients
        if len(coefs) == EMBED_DIM:
            glove_embed[word] = coefs

In [None]:
# Build the embedding matrix; words without a GloVe vector get the mean vector.
# NOTE(review): VOCAB_SIZE is 1,000,000 here, so the matrix has a million rows
# although word2idx holds about 52k words - consider sizing it to the vocabulary
# (together with the Embedding layer's input_dim).
embedding_matrix, unk_words = built_embed_matrix(
    embed=glove_embed,
    word2idx=word2idx,
    VOCAB_SIZE=VOCAB_SIZE,
    EMBED_DIM=EMBED_DIM,
    oov_vec='mean',
)

embed_all shape: (400000, 300) | embed_mean_vec shape: (300,)

prepare embedding matrix...
	embedding matrix shape: (1000000, 300)
	word embedding found: 24626
	word embedding not found: 27379



In [None]:
# Shuffled 80/20 split of the padded sequences and their labels (fixed seed).
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    np.array(data_padded),
    np.array(out),
    test_size=0.2,
    shuffle=True,
    random_state=10,
)
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((50748, 80), (50748,), (12688, 80), (12688,))

In [None]:
# Binary classifier: frozen pre-trained embeddings -> two stacked BiLSTMs ->
# dense ReLU layer -> single sigmoid unit.
NUM_CLASSES = 1
input_ = Input(shape=(MAX_SEQ_LEN,), name="input")

# Embedding weights come from `embedding_matrix` and are not trained;
# mask_zero=True makes downstream layers skip the padding index 0.
embed_layer = Embedding(input_dim= VOCAB_SIZE, weights=[embedding_matrix], 
                        output_dim= EMBED_DIM, input_length = MAX_SEQ_LEN, 
                        trainable= False, mask_zero=True, name="embed")(input_)

# First BiLSTM returns the full sequence so the second one can consume it.
lstm_layer = Bidirectional(LSTM(units= 128, return_sequences=True), name="lstm1")(embed_layer)
lstm_layer = Bidirectional(LSTM(units= 128), name="lstm2")(lstm_layer)

dense_layer = Dense(128, activation="relu", name="dense")(lstm_layer)
# Bound to `output_`, not `out`: `out` holds the label list consumed by the
# train/test split cell, and rebinding it to a Keras tensor broke re-running
# that cell.
output_ = Dense(NUM_CLASSES, activation= "sigmoid", name="output")(dense_layer) 

model_lstm = Model(input_, output_)

model_lstm.compile(optimizer= "adam", 
                   loss= "binary_crossentropy", 
                   metrics= ["accuracy"]) 

model_lstm.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 80)]              0         
_________________________________________________________________
embed (Embedding)            (None, 80, 300)           300000000 
_________________________________________________________________
lstm1 (Bidirectional)        (None, 80, 256)           439296    
_________________________________________________________________
lstm2 (Bidirectional)        (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
output (Dense)               (None, 1)                 129       
Total params: 300,866,561
Trainable params: 866,561
Non-trainable params: 300,000,000
__________________________________

In [None]:
# Balanced class weights for the training labels, as a {label: weight} dict
# suitable for Keras' `class_weight` argument.
# `classes` and `y` are passed by keyword: current scikit-learn releases reject
# them as positional arguments, and the keyword form also works on older ones.
label_values = np.unique(train_y)
balanced_weights = compute_class_weight('balanced', classes=label_values, y=train_y)
# Pair every weight with its own label instead of assuming the labels are 0 and 1.
class_weights = {int(label): float(weight)
                 for label, weight in zip(label_values, balanced_weights)}

In [None]:
# Train the classifier with class weighting.
# NOTE(review): validation_data is the same (test_x, test_y) split that the
# evaluation cell below reports on, so there is no untouched hold-out set.
BATCH_SIZE = 64
EPOCHS = 20

history = model_lstm.fit(train_x, train_y,
                         batch_size = BATCH_SIZE,
                         epochs = EPOCHS,
                         shuffle = True,
                         class_weight = class_weights,
                         validation_data = (test_x, test_y)
                        )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20

In [None]:
# Turn the sigmoid outputs into hard 0/1 predictions with a 0.45 threshold.
pred = model_lstm.predict(test_x)
pred = (pred >= .45).astype(float)

# Number of test samples predicted as class 1.
count = int((pred == 1).sum())

print(count)
print('\nClassification Report:')
print(classification_report(test_y, pred))

3077

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.84      0.75      7862
           1       0.58      0.37      0.45      4826

    accuracy                           0.66     12688
   macro avg       0.63      0.60      0.60     12688
weighted avg       0.64      0.66      0.64     12688



In [None]:
# Persist the trained classifier to model_lstm.h5 and keep an alias named `model`.
model_lstm.save("model_lstm.h5")
model=model_lstm

In [None]:
# Reload the classifier saved above. `load_model` was never imported (the cell
# raised NameError), and the file name now matches the one written by
# model_lstm.save("model_lstm.h5").
model = keras.models.load_model("model_lstm.h5")

NameError: ignored

In [None]:
# Sentence encoder: frozen embeddings followed by a single BiLSTM whose final
# state (2 x 64 units) is used as the sentence vector.
input_ = Input(shape=(MAX_SEQ_LEN,), name="input")

embed_layer = Embedding(input_dim= VOCAB_SIZE, weights=[embedding_matrix], 
                        output_dim= EMBED_DIM, input_length = MAX_SEQ_LEN, 
                        trainable= False, mask_zero=True, name="embed")(input_)

# Bound to `encoder_output` rather than `out`, so this cell does not rebind the
# name that holds the label list used by the train/test split cell.
# NOTE(review): this layer is named "lstm", while the trained classifier's
# recurrent layers are "lstm1"/"lstm2" (128 units); the name-based weight copy
# below therefore never fills it and it keeps its random initialisation.
encoder_output = Bidirectional(LSTM(units= 64, dropout=.25, 
                                    recurrent_dropout=.25, kernel_regularizer = regularizers.l2(0.01)
                                   ), name="lstm")(embed_layer)
# Disabled second recurrent layer, kept for reference (it was previously parked
# in a bare string literal):
# out = Bidirectional(LSTM(units= 128, dropout=.25,
#                                 recurrent_dropout=.25, kernel_regularizer = regularizers.l2(0.01)
#                                ), name="lstm1")(lstm_layer)
model1 = Model(input_, encoder_output)

In [None]:
# Show the encoder architecture before copying weights.
model1.summary()

Model: "functional_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 80)]              0         
_________________________________________________________________
embed (Embedding)            (None, 80, 50)            50000000  
_________________________________________________________________
lstm (Bidirectional)         (None, 128)               58880     
Total params: 50,058,880
Trainable params: 58,880
Non-trainable params: 50,000,000
_________________________________________________________________


In [None]:
# Copy weights from the trained classifier into every encoder layer that has a
# layer of the same name in model_lstm.
# NOTE(review): going by the layer names given when the two models were built,
# only "input" and "embed" match; model1's "lstm" layer has no counterpart.
trained_layers = {trained.name: trained for trained in model_lstm.layers}
for layer in model1.layers:
    if layer.name in trained_layers:
        layer.set_weights(trained_layers[layer.name].get_weights())

In [None]:
# Show the encoder architecture again after the weight copy.
model1.summary()

Model: "functional_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 80)]              0         
_________________________________________________________________
embed (Embedding)            (None, 80, 50)            50000000  
_________________________________________________________________
lstm (Bidirectional)         (None, 128)               58880     
Total params: 50,058,880
Trainable params: 58,880
Non-trainable params: 50,000,000
_________________________________________________________________


In [None]:
# Integer-encode the sentences in `tok1` (this overwrites the `data_idx` built
# earlier for the training sentences).
# Unknown words fall back to the <unk> index. The original tested
# `word2idx[word]`, which raises KeyError for a missing word, so its fallback
# branch could never handle one. `.get(...) or unk_idx` keeps the original
# treatment of a falsy index (0) as unknown.
unk_idx = word2idx['<unk>']
data_idx = [
    [word2idx.get(word.lower()) or unk_idx for word in sent.split()]
    for sent in tok1
]

In [None]:
# Pad / truncate the tok1 sequences to 80 ids (post-padding with 0).
# Note: this rebinds `data_padded`, replacing the padded training data.
MAX_SEQ_LEN = 80
data_padded = keras.preprocessing.sequence.pad_sequences(data_idx, maxlen=MAX_SEQ_LEN, 
                                                         padding='post', truncating='post', value=0)
data_padded.shape

(20053, 80)

In [None]:
# Inspect the first padded id sequence.
data_padded[0]

array([   13,  1956,    14,     5,   824,   815,    10,  3849,     6,
           5,  3466,   815,   140,   135,    10,     5,   604,   824,
       12495,     9,   757,   444,     7,     2,  3466,  4778,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [None]:
# Sentence at flist[1], the first one that was appended to the corpus.
flist[1]

'We say that a PCFG derivation is isomorphic to a STSG derivation if there is a corresponding PCFG subderivation for every step in the STSG derivation.'

In [None]:
word2idx['we']
# Encode the first padded sentence as a batch of one, shape (1, MAX_SEQ_LEN).
# The original indexed with `ind`, which is only defined by the loop in a later
# cell, and passed a 1-D row that Keras treats as MAX_SEQ_LEN separate
# one-token samples (visible in the recorded output: one row per token, with
# all-zero rows for the padding positions).
model1.predict(data_padded[:1])

array([[-0.01414258,  0.03027619,  0.05460392, ...,  0.0155709 ,
         0.01759867, -0.03623072],
       [-0.01773036,  0.02369586,  0.04547004, ...,  0.01808211,
         0.01671089, -0.03453201],
       [-0.01850735,  0.02252924,  0.04583006, ...,  0.01602512,
         0.01719457, -0.03711054],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [None]:
# Encode every padded sentence with model1 and key the vectors by source text.
# One batched predict replaces one predict call per sentence. It also fixes the
# input shape: a single 1-D row was treated by Keras as MAX_SEQ_LEN one-token
# samples, so `sentences[s][0]` was the encoding of the first token only.
# NOTE(review): assumes row `ind` of data_padded corresponds to flist[1 + ind]
# (the sentences were appended to the corpus as flist[1:]) - verify alignment.
sentence_vectors = model1.predict(data_padded)

sentences={}
for ind in range(len(data_padded)):
  # keep a leading batch axis of 1 so `sentences[s][0]` yields the sentence vector
  sentences[flist[1+ind]] = sentence_vectors[ind:ind+1]
print('Predicted')

Predicted


In [None]:
# Inspect the first row of the stored encoding for one sentence.
sentences['Identifying similar pieces of text has many applications (e.g., summarization, information retrieval, text clustering).'][0]

array([-0.00405581,  0.00867388,  0.0266684 , -0.00584137, -0.01624213,
       -0.00910209,  0.01397985, -0.00964349, -0.01093441, -0.01488451,
        0.03511903,  0.0286076 , -0.01481849,  0.01690504, -0.00541163,
       -0.03090755,  0.02775238, -0.01677078,  0.00384965,  0.01111945,
       -0.01154881, -0.0112345 , -0.00628583,  0.01939121, -0.03387162,
       -0.03386471,  0.00667189, -0.00263849,  0.00429086, -0.00268894,
        0.00079686,  0.00136597,  0.02131243,  0.01572224, -0.00833323,
        0.01279465, -0.01036651, -0.02315241,  0.00676249, -0.00045686,
       -0.00098436,  0.01811864, -0.01338414, -0.00345931, -0.02099242,
       -0.02590306,  0.02940363,  0.00798768, -0.05584986, -0.05106822,
       -0.01502245, -0.00098734,  0.01301523,  0.00718705, -0.01810588,
       -0.0210581 ,  0.01286295,  0.00777822, -0.0036751 , -0.0382782 ,
       -0.00277214,  0.03511423, -0.03962625,  0.0017584 ,  0.00495988,
        0.00233423, -0.00306133,  0.0057816 , -0.01654095, -0.01

In [None]:
# For a few distinct col1 sentences: print the sentence, the col2 entries
# marked with col3 == 1, and the three col2 candidates whose encodings have the
# smallest cosine distance to the sentence's encoding.
for sent in list(set(list(df['col1'])))[1:5]:
  print(sent)
  inp = sentences[sent][0]

  # all rows that pair this sentence with a candidate
  candidates = df[df['col1'] == sent]
  opts = list(candidates.col2)
  act = list(candidates[candidates['col3'] == 1].col2)
  print(act)

  # cosine distance from the sentence to each candidate, nearest first
  dict1 = {opt: distance.cosine(inp, sentences[opt][0]) for opt in opts}
  dict1 = dict(sorted(dict1.items(), key=lambda item: item[1]))
  print(list(dict1)[:3])

Thomas et al 2006 address the same problem of determining support and opposition as applied to congressional floor-debates.
['We investigate whether one can determine from the transcripts of U.S. Congressional floor debates whether the speeches represent support of or opposition to proposed legislation.']
['However, if we assume that most speakers do not change their positions in the course of a discussion, we can conclude that all comments made by the same speaker must receive the same label.', 'However, this comes at the cost of greatly reducing agreement accuracy (development: 64.38%; test: 66.18%) due to lowered recall levels.', 'However, it is interesting to consider whether we really need to consider relationships specifically between speech segments themselves, or whether it suffices to simply consider relationships between the speakers of the speech segments.']
We experiment with all the standard data sets, namely, Senseval 2 (SV2) (M. Palmer and Dang, 2001), Senseval 3 (SV3) (

In [None]:
# Inspect a nested slice of one stored encoding.
# NOTE(review): three levels of [0] need an encoding with at least three axes,
# while model1 as defined in this notebook returns one vector per input row -
# this cell looks like a leftover from a different model configuration; confirm.
sentences['They were also free to use the gold-standard data to train their own models for the various layers of annotation, if they judged that those would either provide more accurate predictions or alternative predictions for use as multiple views, or wished to use a lattice of predictions.'][0][0][0]

array([ 1.21708214e-02, -1.12526584e-02,  8.03518761e-03, -6.32692734e-03,
       -1.36157200e-02,  4.07527424e-02, -3.00485883e-02, -2.64439266e-02,
        5.73045686e-02, -2.71533784e-02, -6.89985137e-03,  3.14300023e-02,
        4.35669050e-02, -1.49543840e-03,  5.70946792e-03, -1.06573701e-02,
       -1.20032942e-02, -1.19712240e-04,  2.24856101e-02, -1.09190755e-02,
       -3.85896070e-03, -4.99813780e-02,  5.94684109e-03, -8.38134717e-03,
        8.38036388e-02,  2.39668265e-02,  1.27936285e-02, -6.05915859e-03,
       -5.16921356e-02, -1.16998178e-03, -3.45869293e-03,  3.24087627e-02,
       -5.59058264e-02, -8.99258070e-03, -8.56186450e-03,  8.20480958e-02,
        1.76715124e-02, -1.39234494e-02,  1.06857279e-02,  1.11873597e-02,
       -5.69639196e-05,  4.35248762e-03,  5.97330183e-03,  1.32748643e-02,
        3.27650481e-03, -2.38264985e-02, -6.73057605e-03,  2.00109687e-02,
        6.97190547e-03, -6.09967485e-02,  2.41252165e-02,  1.37784611e-02,
       -4.70085815e-03, -