In [1]:
#!pip install transformers
#!pip install datasets

In [2]:
# Dataframe and computation
import numpy as np
import pandas as pd

# Deep learning libraries
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
#NLTK and regex libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import string

#Sklearn libraries
from sklearn.model_selection import train_test_split

# Downloads for string cleaning
# `wn` is the shared lemmatizer instance that clean_string() calls for every token.
wn = nltk.WordNetLemmatizer()
# Fetch the NLTK stopword list that clean_string() reads.
nltk.download('stopwords')
# Fetch the WordNet corpus; as the last expression in the cell, this call's
# return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ameyagidh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ameyagidh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Import and inspect data

In [3]:
# Load the training CSV (PAWS, judging by the path); the path is relative to the
# directory the notebook is run from.
df = pd.read_csv('../data/paws/train.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,sentence1,sentence2,label
0,0,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0
1,1,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1
2,2,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0
3,3,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1
4,4,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1


## Function to clean strings

In [5]:
# Cleaning function for the strings
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_string(input_str):
    """Normalize a sentence for tokenization.

    Steps, in order: lowercase, strip URLs / "www." links / e-mail addresses,
    drop ASCII punctuation, remove English stopwords, and lemmatize every
    remaining token as a noun with the module-level ``wn`` lemmatizer.

    Args:
        input_str: The raw sentence (must be a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when
        nothing survives cleaning.
    """
    # Lowercase the input_string
    input_str = input_str.lower()

    # Remove URLs, links and e-mail addresses.
    input_str = re.sub(r"http\S+", "", input_str)
    # The dot is escaped so that only literal "www." prefixes are removed,
    # not ordinary words that merely start with "www".
    input_str = re.sub(r"www\.\S+", "", input_str)
    input_str = re.sub(r"\S+@\S+", "", input_str)

    # Remove punctuations
    input_str_punc = "".join(char for char in input_str if char not in string.punctuation)

    # Remove stopwords. The set is cached, so the corpus is not re-read for
    # every row; empty tokens produced by re.split at the string edges are
    # dropped so the result carries no stray spaces.
    stopword = _english_stopwords()
    tokens = [
        word
        for word in re.split(r'\W+', input_str_punc)
        if word and word not in stopword
    ]

    # Lemmatization
    input_str_cleaned = " ".join(wn.lemmatize(word, 'n') for word in tokens)

    return input_str_cleaned

### Apply cleaning function to data

In [6]:
# Show the frame once more before cleaning; the rendered output elides the middle rows.
df

Unnamed: 0.1,Unnamed: 0,sentence1,sentence2,label
0,0,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0
1,1,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1
2,2,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0
3,3,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1
4,4,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1
...,...,...,...,...
49396,49396,"`` Our school is of spiritual and spiritual , ...",`` Our School is of the Temporal and the Spiri...,0
49397,49397,She was in Cork on June 24 and arrived on 8 Ju...,"She was at Cork on 24 June , and arrived in th...",1
49398,49398,Cornelia Stuyvesant Vanderbilt ( George and Ed...,John John F. A. Cecil ( the only child of Geor...,0
49399,49399,The third season was premiered on 7 June 2010 ...,"The fourth season was premiered on June 7 , 20...",0


In [7]:
# Add a cleaned counterpart for each raw sentence column; values are coerced to
# str first so non-string cells do not break clean_string().
for raw_col, clean_col in (("sentence1", "question1"), ("sentence2", "question2")):
    df[clean_col] = df[raw_col].apply(lambda text: clean_string(str(text)))

### Split the data into training and testing sets

In [8]:
# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore every downstream metric, reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [9]:
# Pull the cleaned sentence pair and the label out of the training split as arrays.
sent_1_train, sent_2_train, Y_train = (
    train[column].values for column in ("question1", "question2", "label")
)

In [10]:
# Pull the cleaned sentence pair and the label out of the test split as arrays.
sent_1_test, sent_2_test, Y_test = (
    test[column].values for column in ("question1", "question2", "label")
)

### Tokenize and pad the training/testing data

In [11]:
# Fit the vocabulary on both training sentence columns; the test text is not
# passed to the tokenizer here.
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts([*sent_1_train, *sent_2_train])

In [12]:

# Encode the training sentences with the fitted tokenizer.
# NOTE: this rebinds sent_1_train / sent_2_train from raw text to encoded
# sequences, so the cell should not be run twice without re-running the split.
sent_1_train = tokenizer.texts_to_sequences(sent_1_train)
sent_2_train = tokenizer.texts_to_sequences(sent_2_train)

# Bring every sequence to length 30, padding at the end ('post').
sent_1_train_pad, sent_2_train_pad = (
    pad_sequences(encoded, maxlen=30, padding='post')
    for encoded in (sent_1_train, sent_2_train)
)


In [13]:
# Encode the test sentences with the tokenizer that was fitted on training text.
# NOTE: this rebinds sent_1_test / sent_2_test from raw text to encoded
# sequences, so the cell should not be run twice without re-running the split.
sent_1_test = tokenizer.texts_to_sequences(sent_1_test)
sent_2_test = tokenizer.texts_to_sequences(sent_2_test)

# Bring every sequence to length 30, padding at the end ('post').
sent_1_test_pad, sent_2_test_pad = (
    pad_sequences(encoded, maxlen=30, padding='post')
    for encoded in (sent_1_test, sent_2_test)
)

### Create GloVe embeddings

In [14]:
# Word -> integer index mapping learned by the tokenizer; used later to place
# each word's vector in the embedding matrix.
word_index = tokenizer.word_index

# Parse the pre-trained GloVe file into {token: float32 vector}. Each line is
# the token followed by its vector components, separated by whitespace.
embedding_index = {}
# The encoding is given explicitly so parsing does not depend on the platform's
# default locale encoding.
with open('../data/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_index[word] = vectors
# No explicit close() is needed: the with-block closes the file on exit.

In [15]:
# One 200-dimensional row per entry in word_index, plus one extra row.
# Rows start out random so that words missing from the GloVe index still have
# an embedding; rows for known words are overwritten with the GloVe vector.
embedding_matrix = np.random.random((len(word_index) + 1, 200))
for token, row in word_index.items():
    pretrained = embedding_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

### TensorFlow models

In [16]:
# Question 1 model
# Tower for the first sentence: GloVe-initialised embedding -> two stacked
# LSTMs with dropout after each -> two Dense layers. The layers are passed
# to the constructor in the order they are applied.
model_q1 = tf.keras.Sequential([
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=200,
        weights=[embedding_matrix],
        input_length=30,
    ),
    LSTM(128, activation='relu', return_sequences=True),
    Dropout(0.25),
    LSTM(128, return_sequences=True),
    Dropout(0.25),
    Dense(64, activation='relu'),
    Dense(2, activation='sigmoid'),
])

In [17]:
# Question 2 model
# Tower for the second sentence; it mirrors the question 1 tower layer for layer.
model_q2 = tf.keras.Sequential()
model_q2.add(Embedding(input_dim = len(word_index)+1,
                       output_dim = 200,
                      weights = [embedding_matrix],
                      input_length = 30))
model_q2.add(LSTM(128, activation = 'relu', return_sequences = True))
model_q2.add(Dropout(0.25))
model_q2.add(LSTM(128, return_sequences = True))
# This dropout belongs to model_q2. Adding it to model_q1 (as the previous
# copy-pasted line did) left this tower without its second dropout and
# appended a stray dropout after model_q1's output layer.
model_q2.add(Dropout(0.25))
model_q2.add(Dense(64, activation = 'relu'))
model_q2.add(Dense(2, activation = 'sigmoid'))

In [18]:
# Merging model output
# Combine the two towers by element-wise product of their outputs.
mergedOut = Multiply()([model_q1.output, model_q2.output])

# Flatten the merged tensor and classify it with a small dense head.
mergedOut = Flatten()(mergedOut)
mergedOut = Dense(128, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.25)(mergedOut)
mergedOut = Dense(64, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.25)(mergedOut)
# Two mutually exclusive classes trained with sparse categorical cross-entropy
# need a softmax output so the two scores form a probability distribution;
# independent sigmoids do not sum to 1.
mergedOut = Dense(2, activation = 'softmax')(mergedOut)

# Train the model

In [19]:
# Wire the two towers and the merged head into a single two-input model.
new_model = Model(inputs=[model_q1.input, model_q2.input], outputs=mergedOut)
new_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# NOTE: the held-out test split doubles as the validation set during fitting.
train_inputs = [sent_1_train_pad, sent_2_train_pad]
validation_inputs = [sent_1_test_pad, sent_2_test_pad]
history = new_model.fit(
    train_inputs,
    Y_train,
    batch_size=2000,
    epochs=6,
    validation_data=(validation_inputs, Y_test),
)

Epoch 1/6


2024-03-14 06:38:28.319974: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
