In [None]:
import pandas as pd
import torch as T
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [None]:
import nltk
import re
import jellyfish

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Fetch every NLTK resource the feature functions in this notebook rely on, so
# that Restart Kernel -> Run All works on a fresh environment: METEOR looks up
# WordNet synonyms (with the omw-1.4 data) and word_tokenize needs Punkt.
# NOTE(review): recent NLTK releases load 'punkt_tab' instead of 'punkt' -
# confirm against the installed NLTK version.
for resource in ('omw-1.4', 'wordnet', 'punkt'):
    nltk.download(resource)

In [None]:
def cleanString(string):
    """Collapse every run of non-word characters into one space and trim the ends.

    Args:
        string: the text to normalise.

    Returns:
        The text with each run of non-word characters (anything other than
        letters, digits and underscore) replaced by a single space and with
        leading/trailing whitespace removed.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on current Python versions.
    return re.sub(r'\W+', ' ', string).strip()

In [None]:
def meteorScore(s1, s2):
    """Return the METEOR score for two sentences given as plain strings.

    Both sentences are tokenised with word_tokenize; the tokens of s1 are
    passed as the first argument of single_meteor_score and those of s2 as
    the second.
    """
    first_tokens, second_tokens = word_tokenize(s1), word_tokenize(s2)
    return nltk.translate.meteor_score.single_meteor_score(first_tokens, second_tokens)

In [None]:
def cosineSimilarity(s1, s2):
    """Return the cosine similarity of the term-count vectors of two strings.

    A fresh CountVectorizer is fitted on just these two strings, so the
    vocabulary is the union of their terms.
    """
    term_counts = CountVectorizer().fit_transform([s1, s2])
    first_row, second_row = term_counts[0], term_counts[1]
    similarity = cosine_similarity(first_row, second_row)
    # cosine_similarity returns a 1x1 matrix for two single-row inputs
    return similarity[0][0]

In [None]:
def ngrams(s1, s2, n):
    """Count the distinct word n-grams that occur in both sentences.

    Args:
        s1, s2: sentences as plain strings (tokenised with word_tokenize).
        n: n-gram order.

    Returns:
        Number of unique n-grams shared by the two sentences.
    """
    grams_per_sentence = [set(nltk.ngrams(word_tokenize(text), n)) for text in (s1, s2)]
    shared = grams_per_sentence[0] & grams_per_sentence[1]
    return len(shared)

In [None]:
def jaccard(s1, s2):
    """Return the Jaccard similarity of the token sets of two sentences.

    Args:
        s1, s2: sentences as plain strings (tokenised with word_tokenize).

    Returns:
        |A & B| / |A | B| over the sets of distinct tokens, as a float in
        [0, 1]. Returns 0.0 when neither sentence yields a token.
    """
    tokens1 = set(word_tokenize(s1))
    tokens2 = set(word_tokenize(s2))

    # The union must be taken over distinct tokens; sizing it from the raw token
    # lists lets repeated words inflate the denominator.
    union = tokens1 | tokens2
    if not union:
        # two empty sentences: avoid dividing by zero
        return 0.0

    return len(tokens1 & tokens2) / len(union)

In [None]:
def sorensonDice(s1, s2):
    """Return the Sorensen-Dice coefficient of two collections.

    Each argument is converted with set(), so a string is compared as its set
    of distinct characters and a token list as its set of distinct tokens.

    Args:
        s1, s2: iterables of hashable items (strings or token lists).

    Returns:
        2 * |A & B| / (|A| + |B|) as a float in [0, 1]; 0.0 when both inputs
        are empty.
    """
    items1, items2 = set(s1), set(s2)

    total = len(items1) + len(items2)
    if total == 0:
        # both inputs empty: avoid dividing by zero
        return 0.0

    return (2 * len(items1 & items2)) / total

In [None]:
def amntOverlap(s1, s2):
    """Count the distinct whitespace-separated words present in both strings."""
    shared_words = set(s1.split()) & set(s2.split())
    return len(shared_words)

In [None]:
#feature engineering shared by the labelled and the unlabelled files
def _addPairFeatures(df):
    """Clean the 's1'/'s2' columns of df and append the 14 sentence-pair features.

    The feature columns are appended in a fixed order so that the labelled
    (train/dev) and unlabelled (test) frames always expose identical feature
    vectors to the model.

    Args:
        df: DataFrame with string columns 's1' and 's2'. It is modified in place.

    Returns:
        The same DataFrame with cleaned sentences and the feature columns added.
    """
    df['s1'] = df['s1'].map(cleanString)
    df['s2'] = df['s2'].map(cleanString)

    # Newer jellyfish releases expose this metric as jaro_winkler_similarity and
    # deprecate the old jaro_winkler name; fall back for older installs.
    jaroWinkler = getattr(jellyfish, 'jaro_winkler_similarity', None)
    if jaroWinkler is None:
        jaroWinkler = jellyfish.jaro_winkler

    def bleu(weights):
        # NOTE(review): sentence_bleu receives the raw strings here, so n-grams
        # are formed over characters rather than words - confirm that is intended.
        return lambda s1, s2: sentence_bleu([s1], s2, weights = weights)

    features = [
        ('length difference', lambda s1, s2: abs(len(s1.split(" ")) - len(s2.split(" ")))),
        ('lev distance', nltk.edit_distance),
        ('meteor score', meteorScore),
        # explicit unigram weights: sentence_bleu's default is the 4-gram setting,
        # which made this column a duplicate of 'bleu 4'
        ('bleu 1', bleu((1.0,))),
        ('bleu 2', bleu((0.5, 0.5))),
        ('bleu 3', bleu((1/3, 1/3, 1/3))),
        ('bleu 4', bleu((0.25, 0.25, 0.25, 0.25))),
        ('cosine similarity', cosineSimilarity),
        ('2 ngrams', lambda s1, s2: ngrams(s1, s2, 2)),
        ('3 ngrams', lambda s1, s2: ngrams(s1, s2, 3)),
        ('jaccard similarity', jaccard),
        ('sorenson dice score', sorensonDice),
        ('jaro winkler dist.', jaroWinkler),
        ('# overlap', amntOverlap),
    ]
    for name, func in features:
        df[name] = [func(s1, s2) for s1, s2 in zip(df['s1'], df['s2'])]

    return df

#preprocess dataFrame and add features
def preprocess(txtFile):
    """Load a labelled tab-separated file and compute the pair features.

    Args:
        txtFile: path of a file whose rows are id, s1, s2, gold label.

    Returns:
        DataFrame with columns 'id', 's1', 's2', 'gold label' followed by the
        feature columns.
    """
    columns = ['id', 's1', 's2', 'gold label']
    df = pd.read_csv(txtFile, sep = '\t+', names = columns, engine = 'python')

    return _addPairFeatures(df)

#preprocess test Dataframe
def preprocessTest(testFile):
    """Load an unlabelled tab-separated file and compute the pair features.

    Args:
        testFile: path of a file whose rows are instance id, s1, s2.

    Returns:
        DataFrame with columns 'instance id', 's1', 's2' followed by the same
        feature columns, in the same order, as preprocess produces.
    """
    columns = ['instance id', 's1', 's2']
    df = pd.read_csv(testFile, sep = '\t+', names = columns, engine = 'python')

    return _addPairFeatures(df)

In [None]:
#create class for testing datasets
class TestDataSet(Dataset):
    """Unlabelled dataset: one float32 feature vector per row of the test file."""

    def __init__(self, path):
        """Run preprocessTest on path and keep the feature columns as a tensor."""
        df = preprocessTest(path)
        # columns 0-2 are 'instance id', 's1', 's2'; everything after is a feature
        self.X = T.tensor(df.iloc[:, 3:].values, dtype = T.float32)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the feature vector at position idx."""
        # self.X is already a tensor: index it directly instead of re-wrapping it
        # in T.tensor(), which copies the data and emits a UserWarning.
        return self.X[idx]

In [None]:
#create class for preprocessing training/validation datasets
class ParaphraseDataSet(Dataset):
    """Labelled dataset yielding [features, label] pairs as float32 tensors."""

    def __init__(self, path):
        """Run preprocess on path; column 3 is the label, columns 4+ the features."""
        frame = preprocess(path)
        feature_values = frame.iloc[:, 4:].values
        label_values = frame.iloc[:, 3].values
        self.X = T.tensor(feature_values, dtype = T.float32)
        # labels are stored as a column so each item has shape (1,)
        self.y = T.tensor(label_values, dtype = T.float32).reshape(-1, 1)

    def __len__(self):
        """Number of rows in the dataset."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the two-element list [feature vector, label] at position idx."""
        features, label = self.X[idx], self.y[idx]
        return [features, label]

In [None]:
#Create Multilayer Perceptron
class MLPNet(T.nn.Module):
    """Binary classifier: three hidden layers of width 2 * input_size and one sigmoid output unit."""

    def __init__(self, input_size): #input layer perceptrons = # of features
        super().__init__()
        self.input_size = input_size
        width = input_size * 2
        self.hidden_1 = T.nn.Linear(input_size, width)
        self.hidden_2 = T.nn.Linear(width, width)
        self.hidden_3 = T.nn.Linear(width, width)
        self.output = T.nn.Linear(width, 1)

        #xavier-uniform weights and zero biases for every layer, input side first
        for layer in (self.hidden_1, self.hidden_2, self.hidden_3, self.output):
            T.nn.init.xavier_uniform_(layer.weight)
            T.nn.init.zeros_(layer.bias)

    #FeedForward
    def forward(self, x):
        """Map a batch of feature vectors to sigmoid outputs, one value per row."""
        first = T.relu(self.hidden_1(x))
        second = T.tanh(self.hidden_2(first))
        third = T.relu(self.hidden_3(second))
        return T.sigmoid(self.output(third))

In [None]:
#Datasets: each constructor reads its file and computes all pair features up front
#(the DataLoaders that batch them are created in the next cell)
training = ParaphraseDataSet('/kaggle/input/mlfinalprojectdataset/train_with_label.txt')
validation = ParaphraseDataSet('/kaggle/input/mlfinalprojectdataset/dev_with_label.txt')
testing = TestDataSet('/kaggle/input/mlfinalprojectdataset/test_without_label.txt')

In [None]:
#batch iterators over the three datasets; only the training data is shuffled
train_loader = DataLoader(training, batch_size = 1000, shuffle = True)
validation_loader = DataLoader(validation, batch_size = 1000, shuffle = False)
test_loader = DataLoader(testing, batch_size = 1000, shuffle = False)

In [None]:
#Instantiate Model and set Hyperparameters
MLP_model = MLPNet(14) #14 = number of feature columns added by preprocess / preprocessTest
learn_rate = 0.01
epochs = 1000
#BCELoss expects probabilities, which MLPNet.forward produces via its final sigmoid
criterion = T.nn.BCELoss()
optimizer = T.optim.Adam(MLP_model.parameters(), lr = learn_rate, weight_decay = 0.005)     

In [None]:
# Train the model on the training data, checkpointing whenever validation loss improves
min_valid_loss = np.inf
for epoch in range(epochs):
    MLP_model.train()
    train_loss_sum = 0.0
    train_seen = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = MLP_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion yields the batch mean; weight it by the batch size so the
        # epoch figure is a per-sample mean rather than just the last batch
        train_loss_sum += loss.item() * len(inputs)
        train_seen += len(inputs)
    train_loss = train_loss_sum / train_seen

    # Validate the model on the validation data
    MLP_model.eval()
    val_loss_sum = 0.0
    val_seen = 0
    with T.no_grad():
        for inputs, labels in validation_loader:
            outputs = MLP_model(inputs)
            val_loss_sum += criterion(outputs, labels).item() * len(inputs)
            val_seen += len(inputs)
    # plain Python float, so min_valid_loss never ends up holding a tensor
    val_loss = val_loss_sum / val_seen

    # Print the loss on the training and validation data
    print(f"Epoch {epoch+1}: Train loss: {train_loss:.4f} Validation loss: {val_loss:.4f}")

    if min_valid_loss > val_loss:
        print(f"Epoch: {epoch+1} Validation Loss decreased: {min_valid_loss:.4f} -> {val_loss:.4f} Saving Model")
        min_valid_loss = val_loss
        T.save(MLP_model.state_dict(), "mlp_model.pt")

In [None]:
#Check F1
import sklearn.metrics

#Load Best Model from Validation Loss
MLP_model = MLPNet(14)
MLP_model.load_state_dict(T.load("mlp_model.pt"))
MLP_model.eval()

dev_y = []
predicted_y = []
with T.no_grad():
    for inputs, labels in validation_loader:
        # forward() already ends in a sigmoid, so outputs are probabilities and
        # rounding them gives the 0/1 prediction (no second sigmoid needed)
        outputs = MLP_model(inputs)
        predicted = T.round(outputs)

        # flatten the (batch, 1) tensors so the metrics receive flat label lists
        dev_y.extend(labels.flatten().tolist())
        predicted_y.extend(predicted.flatten().tolist())

f1 = sklearn.metrics.f1_score(dev_y, predicted_y)
accuracy = sklearn.metrics.accuracy_score(dev_y, predicted_y)

print(f"F1 score: {f1:.4f} Accuracy score: {accuracy:.4f}")

In [None]:
#Testing: predict a 0/1 label for every row of the test set with the saved model
MLP_model = MLPNet(14)
MLP_model.load_state_dict(T.load("mlp_model.pt"))
MLP_model.eval()

y_toFile = []

with T.no_grad():
    for inputs in test_loader:
        # forward() already ends in a sigmoid, so outputs are probabilities and
        # rounding them gives the predicted label (no second sigmoid needed)
        outputs = MLP_model(inputs)
        batch_labels = T.round(outputs).flatten()

        y_toFile.extend(int(value) for value in batch_labels.tolist())

In [None]:
#NOTE(review): this runs the whole feature pipeline on the test file a second time
#(TestDataSet already called preprocessTest on it); in the visible code only the
#'instance id' column of df_test is read afterwards
df_test = preprocessTest('/kaggle/input/mlfinalprojectdataset/test_without_label.txt')

In [None]:
#write one "<instance id>\t<predicted label>" line per test row
instance_ids = df_test['instance id'].values
if len(instance_ids) != len(y_toFile):
    # fail before touching the file rather than writing a truncated result
    raise ValueError(f"{len(instance_ids)} instance ids but {len(y_toFile)} predictions")

# the with-block closes the file even if a write fails
with open('AnubhavKunduNN_test_result.txt', 'w') as file:
    for instance_id, label in zip(instance_ids, y_toFile):
        file.write(str(instance_id) + '\t' + str(label) + '\n')