In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/MUStARD

/content/drive/.shortcut-targets-by-id/1l5Gm4knVqJokdeDfo6y-WQxqu7UDewFr/MUStARD


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## LSTM

In [3]:
import tensorflow as tf
import keras
import keras.backend as K

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Concatenate, Conv2D, Flatten, Dense, Embedding, LSTM
from keras.models import Model

from sklearn.feature_extraction.text import CountVectorizer

In [4]:
import pandas as pd
text_data = pd.read_json('/content/drive/MyDrive/MUStARD/data/sarcasm_data.json')

In [5]:
text_data = text_data.transpose()

In [6]:
text_data.shape

(690, 6)

In [None]:
text_data[text_data['sarcasm']==True].count()

utterance           345
speaker             345
context             345
context_speakers    345
show                345
sarcasm             345
dtype: int64

In [7]:
cols = list(text_data.columns)
cols.remove('sarcasm')
X = text_data.loc[:,cols]
Y = text_data.loc[:,'sarcasm']
Y = Y.astype(int)

In [None]:
from sent2vec.vectorizer import Vectorizer

# sentences = [
#     "This is an awesome book to learn NLP.",
#     "DistilBERT is an amazing NLP model.",
#     "We can interchangeably use embedding, encoding, or vectorizing.",
# ]
vectorizer = Vectorizer()
vectorizer.bert(contexts)
vectors = vectorizer.vectors

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [None]:
contexts.shape

(690,)

In [None]:
contexts = vectors

In [94]:
import numpy as np
contexts = []
for context in X['context']:
  contexts.append(''.join(context))
contexts = np.array(contexts)

In [117]:
speaker_feat = CountVectorizer(ngram_range=(1,1), binary=True, min_df=1)

speaker_feat.fit(X['speaker'])
output = speaker_feat.transform(X['speaker'])
speakers_df = pd.DataFrame.sparse.from_spmatrix(output, columns=speaker_feat.get_feature_names())

In [120]:
show_feat = CountVectorizer(ngram_range=(1,1), binary=True, min_df=1)

show_feat.fit(X['show'])
output = show_feat.transform(X['show'])
show_df = pd.DataFrame.sparse.from_spmatrix(output, columns=show_feat.get_feature_names())

In [121]:
show_vectors = show_df.to_numpy()

In [122]:
show_vectors[0]

array([1, 0, 0, 0], dtype=int64)

In [124]:
def join_context(l):
    return " ".join(l)

X['joint_context_speakers'] = X['context_speakers'].apply(join_context)
# X['joint_context_speakers'] = X[['joint_context_speakers','speaker']].apply(lambda x: ' '.join(x),axis=1)

In [125]:
context_speaker_feat = CountVectorizer(ngram_range=(1,1), binary=True, min_df=1)

context_speaker_feat.fit(X['joint_context_speakers'])
output = context_speaker_feat.transform(X['joint_context_speakers'])
context_speakers_df = pd.DataFrame.sparse.from_spmatrix(output, columns=context_speaker_feat.get_feature_names())

In [126]:
context_speakers_vectors = context_speakers_df.to_numpy()

In [123]:
speaker_vectors = speakers_df.to_numpy()

In [None]:
!pip install sent2vec

Collecting sent2vec
  Downloading https://files.pythonhosted.org/packages/4e/c6/1f57065edbc772d9529e4a5f75cb812f29bcc2bf59b8e4c34c8ecfd83fe3/sent2vec-0.2.0-py3-none-any.whl
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 4.2MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 43.3MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 48.2MB/s 
Collecting sacremose

In [130]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import numpy

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, Y)
reports = []
p = []
for train_index,test_index in skf.split(X,Y):
  # print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_val = X.iloc[train_index,:], X.iloc[test_index,:]
  y_train, y_val = Y.iloc[train_index], Y.iloc[test_index]
  t = Tokenizer()
  t.fit_on_texts(X_train['utterance'])
  t.fit_on_texts(contexts[train_index])
  # print(len(t.word_index))
  X_train_seq = t.texts_to_sequences(X_train['utterance'])
  X_train_seq_c = t.texts_to_sequences(contexts[train_index])
  X_val_seq = t.texts_to_sequences(X_val['utterance'])
  X_val_seq_c = t.texts_to_sequences(contexts[test_index])

  y_train = y_train.astype(int)
  y_val = y_val.astype(int)

  arr=[]
  arr_c = []

  for vec in X_train_seq:
    arr.append(len(vec))

  # for vec in X_train_seq_c:
  #   arr_c.append(len(vec))

  max_len = max(arr)
  # print(max_len)
  # print(max(arr_c))
  X_train_seq = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
  X_train_seq_c = pad_sequences(X_train_seq_c,maxlen=50,padding='post')
  X_val_seq = pad_sequences(X_val_seq,maxlen=max_len,padding='post')
  X_val_seq_c = pad_sequences(X_val_seq_c,maxlen=50,padding='post')

  embedding_matrix = get_emb_matrix(t)
  # embedding_matrices.append(embedding_matrix)

  # embedding_matrix = embedding_matrices[fold]
  model,inter_output = get_model()
  # model.fit([X_train_seq,contexts[train_index],speaker_vectors[train_index]],y_train.values.reshape(-1,1), epochs = 100,
  #           batch_size=32)

  model.fit([X_train_seq,X_train_seq_c,speaker_vectors[train_index],context_speakers_vectors[train_index],show_vectors[train_index]],y_train.values.reshape(-1,1), epochs = 25,
            batch_size=32)
  # svm_features = model.layers[5].output
  # print(svm_features)
  # svm_features = svm_features.tolist()

  # feature_arr = list(svm_features)
  # print(feature_arr)

  # SVC.fit(svm_features,y_train)

  outs = inter_output.predict([X_train_seq,X_train_seq_c,speaker_vectors[train_index],context_speakers_vectors[train_index],show_vectors[train_index]])
  outs2 = model.predict([X_train_seq,X_train_seq_c,speaker_vectors[train_index],context_speakers_vectors[train_index],show_vectors[train_index]])

  svm_features = outs

  clf = SVC()
  clf.fit(svm_features,y_train)
  # print(outs2.shape)
  # print(outs.shape)
  # print(type(outs))
  # print(X_val_seq.shape)
  # print(vectors[test_index])
  # print(speaker_vectors[test_index])
  predictions = predict(inter_output,X_val_seq,X_val_seq_c,speaker_vectors[test_index],context_speakers_vectors[test_index],show_vectors[test_index],clf)
  p.append(predictions)

  reports.append(classification_report(y_val,predictions))


(None, 18227)
(None, 11776)
Tensor("flatten_116/Reshape:0", shape=(None, 11776), dtype=float32)
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
here
(None, 23091)
(None, 16640)
Tensor("flatten_118/Reshape:0", shape=(None, 16640), dtype=float32)
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
here
(None, 23091)
(None, 16640)
Tensor("flatten_120/Reshape:0", shape=(None, 16640), dtype=float32)
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/

In [19]:
def get_emb_matrix(t):
  import numpy as np
  embeddings_index = {}
  f = open('/content/drive/MyDrive/MUStARD/data/glove.840B.300d.txt')
  for line in f:
      values = line.split()
      word = values[0]
      if(word in t.word_index):
        st=1
        if(len(values)>301):
          st += len(values) - 301
          coefs = np.asarray(values[st:], dtype='float32')
          embeddings_index[word] = coefs
        else:
          coefs = np.asarray(values[1:], dtype='float32')
          embeddings_index[word] = coefs
  f.close()

  print('Found %s word vectors.' % len(embeddings_index))

  not_present_list = []
  vocab_size = len(t.word_index) + 1
  print('Loaded %s word vectors.' % len(embeddings_index))
  embedding_matrix = np.zeros((vocab_size, len(embeddings_index['no'])))
  for word, i in t.word_index.items():
      if word in embeddings_index.keys():
          embedding_vector = embeddings_index.get(word)
      else:
          not_present_list.append(word)
      if embedding_vector is not None:
          embedding_matrix[i] = embedding_vector
      else:
          embedding_matrix[i] = np.zeros(300)

  return embedding_matrix

In [129]:
def predict(model,X_val_seq,vectors,vectors2,vectors3,vectors4,clf):
  import numpy as np
  from sklearn.metrics import precision_score,recall_score,f1_score
  print("here")
  # print(vectors.all())  
  probs = model.predict([X_val_seq,vectors,vectors2,vectors3,vectors4])

  predicted = clf.predict(probs)
  # print("here2")
  # predicted = []
  # for p in probs:
  #   if(p>=0.5):
  #     predicted.append(1)
  #   else:
  #     predicted.append(0)

  return np.reshape(predicted,(-1,1))

In [131]:
for r in reports:
  print(r)

              precision    recall  f1-score   support

           0       0.45      0.38      0.41        69
           1       0.46      0.54      0.50        69

    accuracy                           0.46       138
   macro avg       0.46      0.46      0.45       138
weighted avg       0.46      0.46      0.45       138

              precision    recall  f1-score   support

           0       0.59      0.59      0.59        69
           1       0.59      0.58      0.58        69

    accuracy                           0.59       138
   macro avg       0.59      0.59      0.59       138
weighted avg       0.59      0.59      0.59       138

              precision    recall  f1-score   support

           0       0.65      0.74      0.69        69
           1       0.69      0.59      0.64        69

    accuracy                           0.67       138
   macro avg       0.67      0.67      0.66       138
weighted avg       0.67      0.67      0.66       138

              preci

In [24]:
from keras.regularizers import l2
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Conv2D, ZeroPadding2D, Activation, Input, concatenate, Bidirectional
from keras.models import Model

from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import Concatenate
from keras.layers.core import Lambda, Flatten, Dense
from keras.initializers import glorot_uniform
from keras.layers import Input, Dense, Flatten, GlobalMaxPool2D, GlobalAvgPool2D, Concatenate, Multiply, Dropout, Subtract, Add, Conv2D

In [128]:
def get_model():
  input_1 = Input(shape=(X_train_seq.shape[1],))
  input_2 = Input(shape=(X_train_seq_c.shape[1],))
  input_3 = Input(shape=(speaker_vectors.shape[1],))
  input_4 = Input(shape=(context_speakers_vectors.shape[1],))
  input_5 = Input(shape=(show_vectors.shape[1],))
  # input_5 = Input(shape=(200,))

  common_embed = Embedding(name="synopsis_embedd",input_dim =len(t.word_index)+1, 
                        output_dim=300,weights=[embedding_matrix], 
                        input_length=X_train_seq.shape[1],trainable=False) 


  lstm_1 = common_embed(input_1)
  lstm_2 = common_embed(input_2)

  common_lstm = LSTM(128,return_sequences=True, activation="relu")

  bidir = Bidirectional(common_lstm)
  vector_1 = bidir(lstm_1)
  vector_1 = Flatten()(vector_1)

  # vector_2 = Flatten()(lstm_2)
  vector_2 = common_lstm(lstm_2)
  vector_2 = Flatten()(vector_2)

  conc = Concatenate(axis=-1)([vector_1,vector_2])
  conc = Concatenate(axis=-1)([conc,input_3,input_4,input_5])
  print(conc.shape)
  print(vector_1.shape)
  print(vector_1)
  # x3 = Subtract()([vector_1, vector_2])
  # x3 = Multiply()([x3, x3])

  # x1_ = Multiply()([vector_1, vector_1])
  # x2_ = Multiply()([vector_2, vector_2])
  # x4 = Subtract()([x1_, x2_])
      
      #https://stackoverflow.com/a/51003359/10650182
  # x5 = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([vector_1, vector_2])
      
  # conc = Concatenate(axis=-1)([x5,x4, x3])
  # conc = Dropout(0.25)(conc)
  # x1 = Dense(8000,activation='relu')(conc)
  # x1 = Dropout(0.1)(x1)
  x1 = Dense(1000, activation="relu")(conc)
  x = Dense(100, activation="relu", name='conc_layer')(x1)
  # x = Dropout(0.01)(x)
  out = Dense(1, activation="sigmoid", name = 'out')(x)
  model = Model([input_1,input_2,input_3,input_4,input_5], [out])

  inter_output_model = keras.Model(model.input, model.get_layer(name='conc_layer').output) 
                                    
  model.compile(loss=["binary_crossentropy"], metrics=['acc',tf.keras.metrics.Precision(),tf.keras.metrics.Recall()], optimizer=Adam(0.00001))
  return model, inter_output_model

In [None]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 46)]         0                                            
__________________________________________________________________________________________________
synopsis_embedd (Embedding)     (None, 46, 300)      474300      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 46, 128)      186880      synopsis_embedd[0][0]            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 5888)         0           bidirectional[0][0]              
_______________________________________________________________________________________

In [111]:
model.save('/content/drive/MyDrive/MUStARD')

INFO:tensorflow:Assets written to: /content/drive/MyDrive/MUStARD/assets
