In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets

# Task1: Sentiment Analysis

## Importing the dataset

In [None]:
# Load a GLUE single-sentence task. CoLA is the active dataset; the commented-out
# lines below load SST-2 instead.

from datasets import load_dataset
# df_train = pd.DataFrame(load_dataset('glue', 'sst2', split='train')).fillna('')
# df_test = pd.DataFrame(load_dataset('glue', 'sst2', split='test')).fillna('')
# df_valid = pd.DataFrame(load_dataset('glue', 'sst2', split='validation')).fillna('')

# Each split is converted to a DataFrame; missing values are replaced by empty strings.
df_train = pd.DataFrame(load_dataset('glue', 'cola', split='train')).fillna('')
df_test = pd.DataFrame(load_dataset('glue', 'cola', split='test')).fillna('')
df_valid = pd.DataFrame(load_dataset('glue', 'cola', split='validation')).fillna('')
# Pull out the targets before the df_* names are rebound below.
y_train = df_train['label']
y_valid = df_valid['label']
y_test = df_test['label']
# From here on df_* hold only the 'sentence' column (a Series, not a DataFrame).
df_train = df_train['sentence']
df_valid = df_valid['sentence']
df_test = df_test['sentence']
df_train.head()

In [None]:
# Extended dataset loading: rebinds the training sentences and labels to the
# contents of the extended TSV; the validation and test variables are untouched.
# NOTE(review): this reads the SST-2 extended file while the cell above loads CoLA — confirm the intended file.
dfe = pd.read_csv('../input/kdextended/extended-dataset/sst2_extended.tsv', sep='\t')
df_train = dfe['sentence']
y_train = dfe['label']

In [None]:
# Extended dataset splitting
# from sklearn.model_selection import train_test_split

# df_train, df_test, y_train, y_test = train_test_split(df_train, y_train,
#                                    random_state=104, 
#                                    test_size=0.2, 
#                                    shuffle=True)

In [None]:
# Class counts of the test labels.
# NOTE(review): GLUE test splits are usually distributed with placeholder labels — confirm these counts are meaningful.
y_test.value_counts()

## Building deep learning model

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Vocabulary cap handed to the tokenizer as its first positional argument.
max_words=10000
tokenizer=Tokenizer(max_words)
# The vocabulary is fitted on the training sentences only and reused for every split.
tokenizer.fit_on_texts(df_train)
sequence_train=tokenizer.texts_to_sequences(df_train)
sequence_valid=tokenizer.texts_to_sequences(df_valid)
sequence_test=tokenizer.texts_to_sequences(df_test)

In [None]:
# Despite its name, word2vec is the tokenizer's word -> integer-index mapping.
word2vec=tokenizer.word_index
# V counts every entry of word_index; it sizes the embedding layers defined later.
V=len(word2vec)
print('dataset has %s number of independent tokens' %V)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# No maxlen is given, so pad_sequences chooses the width from the training sequences.
data_train=pad_sequences(sequence_train)
data_train.shape

In [None]:
# T is the padded width of the training matrix; the other splits are padded to the same width.
T=data_train.shape[1]
data_valid=pad_sequences(sequence_valid,maxlen=T)
data_test=pad_sequences(sequence_test,maxlen=T)
data_test.shape

In [None]:
from tensorflow.keras.layers import Input,Conv1D,MaxPooling1D,Dense,GlobalMaxPooling1D,Embedding,Bidirectional,LSTM,Dropout
from tensorflow.keras.models import Model, Sequential, model_from_json
from tensorflow.keras import utils

In [None]:
# 1-D CNN classifier over token indices:
# embedding -> three Conv1D stages (two followed by max pooling) -> global max pooling -> sigmoid.
D=20  # embedding dimension
i=Input((T,))
x=Embedding(V+1,D)(i)  # V+1 rows: one per word_index entry plus index 0
x=Conv1D(32,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(64,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(128,3,activation='relu')(x)
x=GlobalMaxPooling1D()(x)
# Single sigmoid unit for the binary label.
x=Dense(1,activation='sigmoid')(x)
model=Model(i,x)
model.summary()

## Training the model (CNN)

In [None]:
# Binary cross-entropy matches the single sigmoid output of the CNN.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# Monitor training on the validation split. The test split is the one predictions are
# exported for later in the notebook, so it must not be used as validation data.
cnn_senti=model.fit(data_train,y_train,validation_data=(data_valid,y_valid),epochs=10,batch_size=100)

## Training the model (BiLSTM)

In [None]:
# Bidirectional-LSTM classifier. This rebinds `model`, replacing the CNN built earlier.
model = Sequential()
D=20  # embedding dimension
model.add(Embedding(V+1, D, input_length=T))
lstm_out = 64  # units per LSTM direction
model.add(Bidirectional(LSTM(lstm_out)))
# Dropout layers take their input shape from the preceding layer. The hard-coded
# input_shape arguments were removed: they only apply to a first layer, and the first
# one (64) did not match the concatenated bidirectional output (2 * lstm_out).
model.add(Dropout(.1))
model.add(Dense(10, activation='relu'))
model.add(Dropout(.1))
# Single sigmoid unit for the binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='RMSProp', metrics=['accuracy'])
model.summary()
# utils.plot_model(model, show_shapes=False, expand_nested=True, to_file='sentiment.png')

In [None]:
# Train the BiLSTM. No validation data is passed, so the history holds training metrics only.
training = model.fit(data_train,y_train,epochs=10,batch_size=128)


## Evaluation

In [None]:
from matplotlib import rc
import matplotlib.pyplot as plt

# Use a serif font family (Palatino) for every figure.
rc('font',**{'family':'serif','serif':['Palatino']})

def plot_training(training, filename, ymin=0.5, valid=True):
    """Plot the accuracy and loss curves of a training run and save the figure.

    Parameters
    ----------
    training : object with a ``history`` dict
        Result of ``model.fit``. ``history`` must contain ``'accuracy'`` and
        ``'loss'`` and, when ``valid`` is true, ``'val_accuracy'`` and
        ``'val_loss'`` as well.
    filename : str
        Path the figure is written to.
    ymin : float, default 0.5
        Lower limit of the accuracy axis (the upper limit is fixed at 1).
    valid : bool, default True
        Overlay the validation curves (dashed red) and add a legend.

    Raises
    ------
    KeyError
        If a required metric is missing from ``training.history``.
    """
    history = training.history
    plt.figure(figsize=(16, 5), dpi=300)

    # Left panel: accuracy.
    plt.subplot(1,2,1)
    plt.plot(history['accuracy'], label='accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    if valid:
        plt.plot(history['val_accuracy'], color='red', linestyle='dashed')
        plt.legend(['train', 'validation'])
    plt.ylim([ymin, 1])

    # Right panel: loss.
    plt.subplot(1,2,2)
    plt.plot(history['loss'])
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    if valid:
        plt.plot(history['val_loss'], color='red', linestyle='dashed')
        plt.legend(['train', 'validation'])

    # `bbox_inches` is the savefig option that trims surrounding whitespace;
    # the previous `bbox='tight'` is not a savefig parameter.
    plt.savefig(filename, bbox_inches='tight')
    plt.show()

In [None]:
# The BiLSTM run was fitted without validation data, hence valid=False.
plot_training(training, 'cola.pdf', ymin=0.7, valid=False)

In [None]:
# Loss and accuracy of the current `model` on the training data.
model.evaluate(data_train, y_train, batch_size = 1024, use_multiprocessing=True, workers = -1)

In [None]:
# Loss and accuracy of the current `model` on the test split.
# NOTE(review): only meaningful if y_test holds real labels — confirm for the dataset in use.
model.evaluate(data_test, y_test, batch_size = 1024, use_multiprocessing=True, workers = -1)

In [None]:
# Loss and accuracy of the current `model` on the validation split.
model.evaluate(data_valid, y_valid, batch_size = 1024, use_multiprocessing=True, workers = -1)

In [None]:
# Predict on the test split and write the submission file.
y_pred = model.predict(data_test, batch_size=1024, verbose= 1)
# The sigmoid output has shape (n, 1): round it, cast to integer class ids and flatten,
# so the file contains 0/1 instead of 0.0/1.0.
y_pred = np.round(y_pred).astype(int).ravel()
# Build the two submission columns directly instead of swapping columns afterwards.
pred = pd.DataFrame({'index': np.arange(len(y_pred)), 'prediction': y_pred})
pred.to_csv("COLA.tsv", sep='\t', index=False)

# Task2: Semantic Textual Similarity

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
# STS-B splits as DataFrames; missing values are replaced by empty strings.
sts_train = pd.DataFrame(load_dataset('glue', 'stsb', split='train')).fillna('')
sts_eval = pd.DataFrame(load_dataset('glue', 'stsb', split='validation')).fillna('')
sts_test = pd.DataFrame(load_dataset('glue', 'stsb', split='test')).fillna('')

In [None]:
# Extended dataset loading
dfe = pd.read_csv('../input/kdextended/extended-dataset/stsb_extended.tsv', sep='\t')
sts_train = dfe[['sentence1', 'sentence2']]
y_train = dfe['label']

In [None]:
import nltk
from nltk.corpus import stopwords

def remove_stopwords(dataset):
    """Lower-case, tokenize and stop-word-filter both sentence columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``sentence1`` and ``sentence2`` columns; values are cast to
        ``str`` before processing.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` in which ``sentence1`` and ``sentence2`` hold lists
        of lower-cased tokens with English stop words removed. The input frame
        is not modified.
    """
    # Load the stop-word list once. The original evaluated stopwords.words('english')
    # inside the filter condition, i.e. once per token.
    english_stopwords = frozenset(stopwords.words('english'))

    def _tokenize(text):
        # Tokenize one sentence and keep only the non-stop-word tokens.
        return [token for token in nltk.word_tokenize(text) if token not in english_stopwords]

    data = dataset.copy()
    for column in ('sentence1', 'sentence2'):
        data[column] = dataset[column].astype(str).str.lower().apply(_tokenize)
    return data

In [None]:
# Tokenized, stop-word-filtered copies of the three STS-B frames.
sts_train_stop = remove_stopwords(sts_train)
sts_test_stop = remove_stopwords(sts_test)
sts_eval_stop = remove_stopwords(sts_eval)

In [None]:
def get_document_frequency(df):
    """Count in how many sentences each token occurs.

    Every cell of the ``sentence1`` and ``sentence2`` columns is treated as one
    document (an iterable of tokens); a token is counted at most once per document.

    Returns
    -------
    tuple
        ``(frequencies, n)`` where ``frequencies`` maps token -> number of
        documents containing it and ``n`` is the total number of documents.
    """
    documents = df[["sentence1", "sentence2"]].to_numpy().ravel().tolist()

    frequencies = {}
    for document in documents:
        # set() collapses repeated tokens within a single document
        for token in set(document):
            frequencies.setdefault(token, 0)
            frequencies[token] += 1

    return frequencies, len(documents)

# Document frequencies over the preprocessed training pairs. Both names are
# notebook-level globals that average_sentence_embedding relies on.
document_frequencies, num_documents = get_document_frequency(sts_train_stop)
num_documents

In [None]:
from gensim import models
# Pretrained vectors in word2vec binary format (binary=True), read from the attached input dataset.
word2vec_path = '../input/gnewsvector/GoogleNews-vectors-negative300.bin'
word2vec_model = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
from collections import Counter
import math
def average_sentence_embedding(tokens, embedding_model, doc_frequencies=None, n_documents=None):
    """Return the TF-IDF-weighted average of the word vectors of ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence. Tokens missing from ``embedding_model`` are ignored.
    embedding_model : mapping
        Supports ``token in embedding_model`` and ``embedding_model[token]``
        (returning the token's vector).
    doc_frequencies : dict, optional
        Token -> number of documents containing it. Defaults to the notebook-level
        ``document_frequencies``.
    n_documents : int, optional
        Total number of documents. Defaults to the notebook-level ``num_documents``.

    Returns
    -------
    list
        Components of the averaged vector, or ``[]`` when no token is known to
        ``embedding_model``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if doc_frequencies is None:
        doc_frequencies = document_frequencies
    if n_documents is None:
        n_documents = num_documents

    known_tokens = [token for token in tokens if token in embedding_model]
    if not known_tokens:
        return []

    counts = Counter(known_tokens)
    total = len(known_tokens)

    vectors = []
    weights = []
    for token, occurrences in counts.items():
        term_frequency = occurrences / total
        # +1 in the denominator avoids division by zero for tokens unseen in the corpus
        inverse_doc_frequency = math.log(n_documents / (doc_frequencies.get(token, 0) + 1))
        vectors.append(embedding_model[token])
        weights.append(term_frequency * inverse_doc_frequency)

    # np.average raises ZeroDivisionError when the weights sum to zero (every token
    # has an IDF of 0); use a plain, uniformly weighted mean in that case.
    if np.asarray(weights, dtype=float).sum() == 0:
        weights = [1.0] * len(vectors)

    return list(np.average(vectors, weights=weights, axis=0))

In [None]:
from scipy.spatial import distance
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two vectors (1 minus their SciPy cosine distance)."""
    return 1 - distance.cosine(embedding1, embedding2)

In [None]:
def average_word_embedding_cosine_similarity(df, embedding_model):
    """Score each sentence pair of ``df`` by the cosine similarity of its sentence embeddings.

    ``df`` is modified in place: the columns ``sentence1_embedding``,
    ``sentence2_embedding`` and ``predictions`` are added. A pair scores 0 when the
    components of either embedding sum to zero, which covers the empty embedding
    produced for sentences without any known token.

    Returns
    -------
    list
        The ``predictions`` column as a list.
    """
    for source in ('sentence1', 'sentence2'):
        df[source + '_embedding'] = df.apply(
            lambda row, column=source: average_sentence_embedding(row[column], embedding_model),
            axis=1,
        )

    def _pair_score(row):
        # Cosine similarity of one row's two embeddings, or 0 for degenerate embeddings.
        first, second = row.sentence1_embedding, row.sentence2_embedding
        if sum(first) != 0 and sum(second) != 0:
            return calculate_cosine_similarity(first, second)
        return 0

    df['predictions'] = df.apply(_pair_score, axis=1)
    return df['predictions'].tolist()

In [None]:
# MinMaxScaler is imported here because this cell runs before the later cell that
# imports it; without this the notebook fails on a fresh kernel.
from sklearn.preprocessing import MinMaxScaler

# Cosine-similarity predictions for the test pairs, as a single-column frame.
y_pred = pd.DataFrame(average_word_embedding_cosine_similarity(
    sts_test_stop, word2vec_model
))
# Rescale the similarities to [0, 5] and round to three decimals.
y_pred = np.round(MinMaxScaler(feature_range=(0, 5)).fit_transform(y_pred).flatten(), 3)
# Build the two submission columns directly instead of swapping columns afterwards.
pred = pd.DataFrame({'index': np.arange(len(y_pred)), 'prediction': y_pred})
pred.to_csv("STS-B.tsv", sep='\t', index=False)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Cosine-similarity scores for the validation pairs, as a single-column frame.
similarities = pd.DataFrame(average_word_embedding_cosine_similarity(
    sts_eval_stop, word2vec_model
))

# Rescale the scores to [0, 5] before comparing them with the validation labels.
similarities = MinMaxScaler(feature_range=(0, 5)).fit_transform(similarities).flatten()
similarities
# Correlation-coefficient matrix between the predictions and the validation labels.
np.corrcoef(similarities, sts_eval_stop['label'])

In [None]:
from scipy.stats import spearmanr
# Spearman rank correlation (rho) and its p-value against the validation labels.
rho, p = spearmanr(similarities, sts_eval_stop['label'])
rho

In [None]:
# Inspect the validation frame, including the embedding and prediction columns added to it above.
sts_eval_stop

# Task3: Sentence-pair Classification

In [None]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, TimeDistributed, SimpleRNN, GlobalMaxPooling1D
from keras.metrics import AUC, Accuracy
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from tensorflow.keras import utils

In [None]:
!pip install datasets

## Importing the dataset

In [None]:
# train_data = pd.read_csv("../input/multinli-nyu/multinli_1.0/multinli_1.0_train.txt", sep='\t', error_bad_lines=False, engine='python')
# valid_data = pd.read_csv("../input/multinli-nyu/multinli_1.0/multinli_1.0_dev_matched.txt", sep='\t', error_bad_lines=False, engine='python')
# # test_data = pd.read_csv("../input/stanford-natural-language-inference-corpus/snli_1.0_test.csv")
# # validation_data = pd.read_csv("../input/stanford-natural-language-inference-corpus/snli_1.0_dev.csv")

# train_data = train_data[:40000]
# test_data = test_data[:10000]
# validation_data = validation_data[:10000]

# dataset = pd.concat([train_data, test_data, validation_data])

from datasets import load_dataset
# train_dataset = load_dataset('glue', 'mrpc', split='train')
# eval_dataset = load_dataset('glue', 'mrpc', split='validation')
# test_dataset = load_dataset('glue', 'mrpc', split='test')

# train_dataset = load_dataset('glue', 'rte', split='train')
# eval_dataset = load_dataset('glue', 'rte', split='validation')
# test_dataset = load_dataset('glue', 'rte', split='test')

# Active task: WNLI. The commented-out blocks above switch to MRPC or RTE.
train_dataset = load_dataset('glue', 'wnli', split='train')
eval_dataset = load_dataset('glue', 'wnli', split='validation')
test_dataset = load_dataset('glue', 'wnli', split='test')


# Convert the splits to DataFrames.
train = pd.DataFrame(train_dataset)
# NOTE: this rebinds the name `eval`, shadowing the Python builtin in later cells.
eval = pd.DataFrame(eval_dataset)
test = pd.DataFrame(test_dataset)
# English stop-word list; read as a global by CleanFeatures.
stop_words = stopwords.words('english')

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Extended dataset loading
# Extended dataset loading: rebinds `train`, replacing the frame built from the GLUE training split.
train = pd.read_csv('../input/kdextended/extended-dataset/wnli_extended.tsv', sep='\t')

In [None]:
# Balance the classes: draw 943 rows from every label group, then shuffle all rows (sample(frac=1)).
# NOTE(review): no random_state is set, so the draw differs between runs; 943 is
# presumably the size of the smallest class — confirm.
train = train.groupby('label', group_keys=False).apply(lambda x: x.sample(943)).sample(frac=1)

In [None]:
# Rows per label after the sampling step.
train['label'].value_counts()

In [None]:
# Preview the first training rows.
train.head()

## Preprocessing

In [None]:
# Keep only the columns the model needs and drop incomplete rows. dropna() returns a
# new frame, which avoids calling an in-place method on a column slice of another
# DataFrame (the source of SettingWithCopyWarning).
dataset = train[['label', 'sentence1', 'sentence2']].dropna(axis=0)

valid_data = eval[['label', 'sentence1', 'sentence2']].dropna(axis=0)

In [None]:
# Drop rows whose label equals the "-" placeholder, then shuffle the training rows.
dataset = dataset.loc[dataset['label'] != "-"]
# dataset = dataset.loc[dataset['gold_label'] != "neutral"]
dataset = dataset.sample(frac = 1)

# Same placeholder filter for the validation rows (no shuffling).
valid_data = valid_data.loc[valid_data['label'] != "-"]
# valid_data = valid_data.loc[valid_data['gold_label'] != "neutral"]
valid_label = valid_data['label']

# Training inputs: the two sentence columns, taken after shuffling so they stay aligned with `label`.
sentence1 = dataset['sentence1']
sentence2 = dataset['sentence2']

label = dataset['label']

In [None]:
def CleanFeatures(sentences, stopword_set=None):
  """Strip punctuation, lower-case and remove stop words from every sentence.

  Parameters
  ----------
  sentences : pandas.Series of str
      Raw sentences.
  stopword_set : iterable of str, optional
      Words to remove. Defaults to the notebook-level ``stop_words``.

  Returns
  -------
  pandas.Series of str
      Cleaned sentences whose remaining words are joined by single spaces.

  Notes
  -----
  Punctuation is removed before the stop-word filter runs, so a stop word that
  contains an apostrophe no longer matches its entry once the apostrophe is gone.
  """
  if stopword_set is None:
    stopword_set = stop_words
  # Sets give constant-time membership tests instead of scanning a list for every word.
  stopword_set = frozenset(stopword_set)
  punctuation = frozenset(string.punctuation)

  def _clean(sequence):
    # Characters are lower-cased one by one, dropping punctuation characters.
    stripped = ''.join(ch.lower() for ch in sequence if ch not in punctuation)
    return ' '.join(word for word in stripped.split() if word not in stopword_set)

  return sentences.apply(_clean)

# Clean the training and validation sentences; sentence1/sentence2 are rebound to their cleaned versions.
sentence1 = CleanFeatures(sentence1)
sentence2 = CleanFeatures(sentence2)
valid_sentence1 = CleanFeatures(valid_data['sentence1'])
valid_sentence2 = CleanFeatures(valid_data['sentence2'])

In [None]:
# Collect both cleaned training columns and flatten them into one 1-D array of
# texts, which is what the tokenizer is fitted on in a later cell.
all_sentences = np.asarray([sentence1, sentence2])
all_sentences = all_sentences.reshape(-1,1 )
all_sentences = all_sentences.reshape(all_sentences.shape[0])

In [None]:
# Number of texts in the flattened array.
all_sentences.shape

In [None]:
# Rebinds `tokenizer` (Task 1 used the same name). The vocabulary is fitted on the
# training sentences of both columns; every sequence is brought to maxlen = 100.
tokenizer = Tokenizer(num_words=6000)
tokenizer.fit_on_texts(all_sentences)
sentence1_seq = tokenizer.texts_to_sequences(sentence1)
sentence1 = pad_sequences(sentence1_seq, maxlen = 100)

sentence2_seq = tokenizer.texts_to_sequences(sentence2)
sentence2 = pad_sequences(sentence2_seq, maxlen = 100)

# Validation sentences reuse the tokenizer fitted on the training data.
valid_sentence1_seq = tokenizer.texts_to_sequences(valid_sentence1)
valid_sentence1 = pad_sequences(valid_sentence1_seq, maxlen = 100)

valid_sentence2_seq = tokenizer.texts_to_sequences(valid_sentence2)
valid_sentence2 = pad_sequences(valid_sentence2_seq, maxlen = 100)

In [None]:
# Encode the training labels as integers, then one-hot encode them.
label_ = LabelEncoder()
labels = label_.fit_transform(label)
# Number of entries in the tokenizer's word index; sizes the embedding matrix later.
vocabulary = len(tokenizer.word_index)
labels = to_categorical(labels)

# Validation labels go through the encoder fitted on the training labels.
valid_labels = label_.transform(valid_label)
valid_labels = to_categorical(valid_labels)

In [None]:
# Rebinds all_sentences: the two padded training matrices stacked along a new
# leading axis (axis 0 selects sentence1 vs. sentence2).
all_sentences = np.asarray([sentence1, sentence2])
all_sentences.shape

In [None]:
# The two padded validation matrices stacked along a new leading axis
# (axis 0 selects sentence1 vs. sentence2).
valid_sentences = np.asarray([valid_sentence1, valid_sentence2])
valid_sentences.shape

In [None]:
# Bring the array from (2, n_pairs, length) to (n_pairs, 2, length) by swapping axes.
# A reshape keeps the memory order, so it would put sentence1 of two consecutive rows
# into one sample instead of pairing each sentence1 with its own sentence2.
all_sentences = all_sentences.transpose(1, 0, 2)
all_sentences.shape

In [None]:
# Bring the array from (2, n_pairs, length) to (n_pairs, 2, length) by swapping axes.
# A reshape keeps the memory order, so it would put sentence1 of two consecutive rows
# into one sample instead of pairing each sentence1 with its own sentence2.
valid_sentences = valid_sentences.transpose(1, 0, 2)
valid_sentences.shape

In [None]:
def glove_word_embedding(file_name, vocabulary, word_index=None, embedding_dim=50):
  """Build an embedding matrix from a GloVe text file.

  Parameters
  ----------
  file_name : str
      Path of the vector file: one token per line followed by its vector
      components, separated by whitespace.
  vocabulary : int
      Vocabulary size; the matrix gets ``vocabulary + 1`` rows.
  word_index : dict, optional
      Mapping word -> row index. Defaults to ``tokenizer.word_index`` of the
      notebook-level ``tokenizer``.
  embedding_dim : int, default 50
      Length of each vector; must match the vectors stored in ``file_name``.

  Returns
  -------
  numpy.ndarray
      Object-dtype array of shape ``(vocabulary + 1, embedding_dim)``. Rows of
      words that are missing from the file stay all-zero.
  """
  if word_index is None:
    word_index = tokenizer.word_index

  embeddings_index = {}
  # The context manager closes the file even when a line fails to parse; the
  # encoding is explicit so reading does not depend on the platform default.
  with open(file_name, encoding='utf-8') as file_:
    for line in file_:
      arr = line.split()
      if not arr:  # skip blank lines instead of failing on arr[0]
        continue
      embeddings_index[arr[0]] = np.asarray(arr[1:], dtype='float32')

  # NOTE(review): the object dtype is kept from the original implementation; a
  # float dtype is probably what consumers expect — confirm before changing.
  embedding_matrix = np.zeros((vocabulary + 1, embedding_dim)).astype(object)
  for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector
  return embedding_matrix

In [None]:
# Number of entries in the tokenizer's word index.
vocabulary

In [None]:
# model = Sequential()
# D=100
# model.add(Embedding(vocabulary+1, D, input_length=train_text.shape[1]))
# lstm_out = 10
# model.add(Bidirectional(LSTM(lstm_out)))
# model.add(Dropout(.1, input_shape=(10,)))
# model.add(Dense(5, activation='relu'))
# model.add(Dropout(.1, input_shape=(5,)))
# model.add(Dense(2, activation='softmax'))
# model.compile(loss='binary_crossentropy', optimizer='RMSProp', metrics=['accuracy'])
# model.summary()
# model.compile(loss="binary_crossentropy",optimizer='adam',metrics=['accuracy'])
# training = model.fit(train_text,labels,
#                          epochs = 10,
#                          use_multiprocessing=True,
#                          workers=-1,
# #                          validation_split=0.1,
# #                          callbacks=[callback],
# #                          validation_data=(valid_text,valid_labels)
#                     )

## Building the deep learning model

In [None]:
# model_rnn = tf.keras.models.Sequential()
# model_rnn.add(Embedding(vocabulary + 1, 50, input_shape=(all_sentences.shape[1], all_sentences.shape[2],)))
# model_rnn.add(tf.keras.layers.TimeDistributed(SimpleRNN(128, return_sequences=True)))
# model_rnn.add(tf.keras.layers.Dropout(0.10))
# model_rnn.add(tf.keras.layers.TimeDistributed(SimpleRNN(128, return_sequences=True)))
# model_rnn.add(tf.keras.layers.Dropout(0.10))
# model_rnn.add((tf.keras.layers.TimeDistributed(GlobalMaxPooling1D())))
# model_rnn.add(tf.keras.layers.Flatten())
# model_rnn.add(tf.keras.layers.Dense(2, activation='softmax'))
# model_rnn.layers[0].set_weights([glove_word_embedding("../input/glove6b50dtxt/glove.6B.50d.txt", vocabulary)])
# model_rnn.layers[0].trainable = False
# from keras.optimizers import RMSprop
# model_rnn.compile(loss="binary_crossentropy",optimizer='Adam',metrics=['accuracy'])
# model_rnn.summary()
# utils.plot_model(model_rnn, show_shapes=False, expand_nested=True, to_file='sentence-pair.png', show_layer_names=False)

In [None]:
# Sentence-pair classifier. TimeDistributed applies the same embedding, LSTM and
# pooling layers to each of the two sentences of a pair.
# NOTE: LSTM and Model are not imported in this task's import cell; they come
# from the Task 1 import cell.
x = Input(shape=(all_sentences.shape[1], all_sentences.shape[2],))
p = TimeDistributed(Embedding(vocabulary + 1, 50))(x)
p = TimeDistributed(LSTM(64, return_sequences=True))(p)
p = TimeDistributed(LSTM(32, return_sequences=True))(p)
# Max- and average-pooled summaries of every sentence are concatenated.
x1 = TimeDistributed(tf.keras.layers.GlobalMaxPooling1D())(p)
x2 = TimeDistributed(tf.keras.layers.GlobalAveragePooling1D())(p)
concat = tf.keras.layers.concatenate([x1, x2])
p = tf.keras.layers.Dropout(0.35)(concat)
p = Flatten()(p)
# Two softmax units, matching the one-hot labels.
p = Dense(2, activation="softmax")(p)

m = Model(inputs = x, outputs = p)
# layers[1] is the TimeDistributed embedding wrapper: load the 50-d GloVe matrix and freeze it.
m.layers[1].set_weights([glove_word_embedding("../input/glove6b50dtxt/glove.6B.50d.txt", vocabulary)])
m.layers[1].trainable = False
m.compile(loss="binary_crossentropy",optimizer='adam',metrics=["accuracy",])

## Training the model (RNN)

In [None]:
class MyThresholdCallback(tf.keras.callbacks.Callback):
    """Stop training once the training accuracy reaches a threshold.

    Parameters
    ----------
    threshold : float
        Training is stopped at the end of the first epoch whose logged
        ``"accuracy"`` is greater than or equal to this value.
    """

    def __init__(self, threshold):
        super(MyThresholdCallback, self).__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when this epoch's training accuracy meets the threshold."""
        # `logs` defaults to None and may not contain the metric; treat both cases
        # as "no accuracy reported" instead of raising TypeError/KeyError.
        accuracy = (logs or {}).get("accuracy")
        if accuracy is not None and accuracy >= self.threshold:
            self.model.stop_training = True
            
# Stop as soon as the training accuracy reaches 0.90.
callback=MyThresholdCallback(threshold=0.90)
# Validation data is disabled (commented out below), so the returned history
# holds training metrics only.
training = m.fit(all_sentences,labels,
                         epochs = 20,
                         use_multiprocessing=True,
                         workers=-1,
                         callbacks=[callback],
#                          validation_data=(valid_sentences,valid_labels)
                    )

In [None]:
# Loss and accuracy of the sentence-pair model on the validation pairs.
m.evaluate(valid_sentences, valid_labels, batch_size=1024, workers=-1)

In [None]:
# Clean both sentences of every test pair. The second one is read from
# 'sentence2' (it was previously built from 'sentence1' a second time).
test_sentence1 = CleanFeatures(test['sentence1'])
test_sentence2 = CleanFeatures(test['sentence2'])

# Reuse the tokenizer fitted on the training data and the same maxlen as in training.
test_sentence1_seq = tokenizer.texts_to_sequences(test_sentence1)
test_sentence1 = pad_sequences(test_sentence1_seq, maxlen = 100)

test_sentence2_seq = tokenizer.texts_to_sequences(test_sentence2)
test_sentence2 = pad_sequences(test_sentence2_seq, maxlen = 100)

test_sentences = np.asarray([test_sentence1, test_sentence2])
# Swap axes, (2, n_pairs, length) -> (n_pairs, 2, length). A reshape keeps the memory
# order and would mix sentences of different rows into one sample.
test_sentences = test_sentences.transpose(1, 0, 2)
test_sentences.shape

In [None]:
# Class id per test pair: the position of the larger softmax output.
class_scores = m.predict(test_sentences, batch_size=1024, verbose= 1)
y_pred = class_scores.argmax(axis=1)
# Submission frame: running row number first, predicted class id second.
pred = pd.DataFrame({'index': np.arange(len(y_pred)), 'prediction': y_pred})
pred.to_csv("WNLI_nokd.tsv", sep='\t', index=False)

In [None]:
# Export variant for RTE: same prediction step, written out with string labels further below.
y_pred = m.predict(test_sentences, batch_size=512, verbose= 1)
# Class id per test pair: the position of the larger softmax output.
y_pred = np.argmax(y_pred, axis = 1)
def labelize(num):
    """Map a predicted class id to its label string.

    Returns ``'not_entailment'`` for 0 and ``'entailment'`` for any other value.

    NOTE(review): the Hugging Face ``glue/rte`` label list reportedly puts
    'entailment' first (id 0) — confirm that this mapping matches the label
    encoding of the data the model was trained on.
    """
    return 'not_entailment' if num == 0 else 'entailment'
pred = pd.DataFrame(y_pred)
pred['index'] = pred.index
c = pred.columns
# Swap the contents of the two columns so the row number comes first, then name
# the columns accordingly.
pred[[c[0], c[1]]] = pred[[c[1], c[0]]]
pred.columns = ['index', 'prediction']
# Replace the numeric class ids with their string labels.
pred['prediction'] = pred['prediction'].apply(lambda x: labelize(x))
pred.to_csv("RTE_kd.tsv", sep='\t', index=False)

## Evaluation

In [None]:
# The sentence-pair model was fitted without validation data, so its history has no
# 'val_accuracy'/'val_loss' entries; plot the training curves only.
plot_training(training, 'mrpc.pdf', valid=False)