# Stephen Lanna
# Assignment 3
# NLP
# University of Southern Maine

Example of training fastText model and getting sentence embeddings

In [32]:
import numpy as np
import nltk
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
from gensim.models import FastText
from scipy import spatial

def get_sentence_embedding(model, sentence):
  # This method takes in the trained model and the input sentence
  # and returns the embedding of the sentence as the average embedding
  # of its words
  words = sentence.split(" ")
  vector = model.wv[words[0]].copy()
  for i in range(1, len(words)):
    vector += model.wv[words[i]]
  return vector/len(words)



0.7853913903236389
0.8745558857917786


# Question 1

Reading Law Stack Exchange Data

In [34]:
import csv
from post_parser_record import PostParserRecord
import re

def read_tsv_test_data(file_path):
  # Takes in the file path for test file and generate a dictionary
  # of question id as the key and the list of question ids similar to it
  # as value. It also returns the list of all question ids that have
  # at least one similar question
  dic_similar_questions = {}
  lst_all_test = []
  with open(file_path) as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        question_id = int(row[0])
        lst_similar = list(map(int, row[1:]))
        dic_similar_questions[question_id] = lst_similar
        lst_all_test.append(question_id)
        lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test


def train_model(lst_sentences):
  model = FastText(sg=1,vector_size=100,window=5,min_n=1)
  model.build_vocab(lst_sentences)
  model.train(lst_sentences, total_examples=len(lst_sentences), epochs=10)
  model.save("my_model.model")

  return model



duplicate_file = "duplicate_questions.tsv"
post_file = "Posts_law.xml"
dic_similar_questions, lst_all_test = read_tsv_test_data(duplicate_file)
post_reader = PostParserRecord(post_file)

lst_training_sentences = []
for question_id in post_reader.map_questions:
  if question_id in lst_all_test:
    continue
  question = post_reader.map_questions[question_id]
  title = question.title
  body = question.body
  # Collect sentences here
  title_sentences = nltk.sent_tokenize(question.title)
  body_sentences = nltk.sent_tokenize(question.body)

  titles = []
  bodies = []
  for sentence in title_sentences:
        sentence = re.sub('<[^<]+?>', '', sentence)
        words = nltk.word_tokenize(sentence)
        titles.append(words)
  for sentence in body_sentences:
        sentence = re.sub('<[^<]+?>', '', sentence)
        words = nltk.word_tokenize(sentence)
        bodies.append(words)
  lst_training_sentences.extend(titles)
  lst_training_sentences.extend(bodies)
  
  lst_answers = question.answers

  if lst_answers is not None:
    for answer in lst_answers:
      answer_body = answer.body
      answer_sentences = nltk.sent_tokenize(answer_body)
      answers = []

      for sentence in answer_sentences:
        sentence = re.sub('<[^<]+?>', '', sentence)
        words = nltk.word_tokenize(sentence)
        answers.append(words)
      lst_training_sentences.extend(answers)
# train your model

model = train_model(lst_training_sentences)

In [49]:
# Load our model
model = FastText.load("myFastText.model")

In [50]:
# Get embeddings
def get_embeddings_title_body(model, dictionary, map):
  embeddings = {}
  for id in dictionary:
    question = map[id]
    title = re.sub('<[^<]+?>', '', question.title)
    body = re.sub('<[^<]+?>', '', question.body)
    title_sentences = nltk.sent_tokenize(title)
    body_sentences = nltk.sent_tokenize(body)
    title_embeddings = []
    body_embeddings = []

    for sentence in title_sentences:
      title_embeddings.append(get_sentence_embedding(model, sentence))

    # Get the average title embedding
    average_title_embedding = np.mean(title_embeddings, axis=0)

    for sentence in body_sentences:
      body_embeddings.append(get_sentence_embedding(model, sentence))

    # Get the average body embedding
    average_body_embedding = np.mean(body_embeddings, axis=0)

    embeddings[id] = [average_title_embedding, average_body_embedding]
    #print(id)
  return embeddings

In [51]:
all_embeddings = get_embeddings_title_body(model, post_reader.map_questions, post_reader.map_questions)
positive_embeddings = get_embeddings_title_body(model, dic_similar_questions, post_reader.map_questions)

In [41]:
def cos_sim(v1, v2):
  return 1 - spatial.distance.cosine(v1, v2)

In [54]:
def update_most_similar(potential_max, current_max, potential_id, current_id):
  if potential_max > current_max:
    return potential_max, potential_id
  else:
    return current_max, current_id

In [107]:
import time
def get_similarities(model, all_embeddings, positive_embeddings, dic_similar_questions):
  # This dictionary will have the test question id as the key
  # and the most similar question id as the value
  dictionary_result_title = {}
  dictionary_result_body = {}

  # finding Similar questions using fastText model
  for test_question_id in dic_similar_questions:
      max_title_sim = -1
      max_body_sim = -1
      most_sim_title_id = -1
      most_sim_body_id = -1

      for embedding in all_embeddings:
        # we are not comparing a question with itself
        if embedding == test_question_id:
          continue

        # Get similarities for titles only
        #title_similarity = 1 - spatial.distance.cosine(all_positive_embeddings[test_question_id][0], all_embeddings[question_id][0])

        title_v1 = positive_embeddings[test_question_id][0]
        title_v2 = all_embeddings[embedding][0]
        #print(title_v1)
        #print(title_v2)

        title_sim = cos_sim(title_v1, title_v2)
        #print(title_sim)
        #time.sleep(20)
        #print(test_question_id, ":", title_sim)

        # Get similarities for bodies only
        body_v1 = positive_embeddings[test_question_id][1]
        body_v2 = all_embeddings[embedding][1]

        body_sim = cos_sim(body_v1, body_v2)
        #print(body_sim)
        #time.sleep(20)

        max_title_sim, most_sim_title_id = update_most_similar(title_sim, 
                                                               max_title_sim, 
                                                               embedding, 
                                                               most_sim_title_id)
        max_body_sim, most_sim_body_id = update_most_similar(body_sim,
                                                              max_body_sim, 
                                                              embedding, 
                                                              most_sim_body_id)
        #print(max_title_sim) 
        #print(max_body_sim) 
        #print(most_sim_title_id)
        #print(most_sim_body_id)
        #time.sleep(3)
      dictionary_result_title[test_question_id] = most_sim_title_id
      dictionary_result_body[test_question_id] = most_sim_body_id

  return dictionary_result_title, dictionary_result_body

In [108]:
dic_result_title, dic_result_body = get_similarities(model, all_embeddings, positive_embeddings, dic_similar_questions)

In [112]:
def p_at_1(dic, dic_similar_questions, type):
  sum = 0

  for id in dic:
    if dic[id] in dic_similar_questions:
      sum += 1
  
  p_at_1_avg = sum / len(dic)

  print(sum, "matches for question ", type)
  print("Average (", type, "): ", p_at_1_avg)

In [113]:
p_at_1(dic_result_title, dic_similar_questions, "titles")

4 matches for question  titles
Average ( titles ):  0.014184397163120567


In [114]:
p_at_1(dic_result_body, dic_similar_questions, "bodies")

3 matches for question  bodies
Average ( bodies ):  0.010638297872340425


# Question 2

In [115]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch

import matplotlib.pyplot as plt


class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Define layers

        self.layer_1 = nn.Linear(input_dim, hidden_dim_1)

        self.relu_1 = nn.ReLU()

        self.layer_2 = nn.Linear(hidden_dim_1, hidden_dim_2)

        self.relu_2 = nn.ReLU()

        self.layer_3 = nn.Linear(hidden_dim_2, output_dim)

    def forward(self, x):
        # Your network forward pass

        # modify this line

        out = self.layer_1(x)
        out = self.relu_1(out)

        out = self.layer_2(out)
        out = self.relu_2(out)

        out = self.layer_3(out)

        return torch.sigmoid(out)

In [116]:
def plot_train_val_losses(train_losses, val_losses):
    # Plot the training and validation losses
    plt.plot(train_losses, label='Train')
    plt.plot(val_losses, label='Validation')
    plt.title('Training and Validation Losses')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [117]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# here goes your parameters
# sample code to define your model 
#model = FeedforwardNeuralNetModel(input_dim, hidden_dim_1, out_dim)
#model.to(devide)
input_dim = 100
hidden_dim_1 = 16
hidden_dim_2 = 32
hidden_dim_3 = 32
output_dim = 3
num_epochs = 10000
model = FeedforwardNeuralNetModel(input_dim, hidden_dim_1, hidden_dim_2, 
                                  hidden_dim_3, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1e-3)
model.to(device)
criterion = criterion.to(device)


def calculate_accuracy(y_true, y_pred):
  # this method will be used to calculate the accuracy of your model
    correct = (y_true.argmax (dim = 1) == y_pred.argmax (dim = 1)).float()
    acc = correct.sum() / len(correct)
    return acc
def training(tfidfX_train, Y_train, tfidfX_val, Y_val):
  # this method will be used for training your model
  # inputs are the training and validation sets
  # You can define batch size of your choice
  batch_size = 2000
  #print(type(tfidfX_train))
  X_train_mini_batches = torch.split(tfidfX_train, batch_size)
  Y_train_mini_batches = torch.split(Y_train, batch_size)
  train_losses = []
  train_accuracies = []
  val_losses = []
  val_accuracies = []
  best_accuracy = 0
  for epoch in tqdm(range(num_epochs)):
      epoch_loss = 0
      epoch_accuracy = 0
      validation_loss = 0
      val_accuracy = 0
      for X_train_mini_batch, Y_train_mini_batch in zip(X_train_mini_batches, Y_train_mini_batches):
          X_train_mini_batch = X_train_mini_batch.to(device)
          Y_train_mini_batch = Y_train_mini_batch.to(device)
          # Continue code here to train the network
          # here check your validation set
          # you have to save the model with the best loss or maybe accuracy?
          train_prediction = model.forward(X_train_mini_batch.float())
          train_prediction = torch.squeeze(train_prediction)
          train_loss = criterion(train_prediction, Y_train_mini_batch.float())
          optimizer.zero_grad()
          train_loss.backward()
          optimizer.step()
          epoch_loss += train_loss.item()
          epoch_accuracy += calculate_accuracy(Y_train_mini_batch, train_prediction)

      tfidfX_val = tfidfX_val.to(device)
      Y_val = Y_val.to(device)
      val_prediction = model.forward(tfidfX_val.float())
      val_prediction = torch.squeeze(val_prediction)
      val_loss = criterion(val_prediction, Y_val.float())
      validation_loss = val_loss.item()
      val_accuracy = calculate_accuracy(Y_val, val_prediction)
      if val_accuracy > best_accuracy:
        torch.save(model.state_dict(), "best_model_state.bin")
        best_accuracy = val_accuracy
      epoch_loss /= len(X_train_mini_batches)
      epoch_accuracy /= len(X_train_mini_batches)
      val_losses.append(validation_loss)
      train_losses.append(epoch_loss)
      train_accuracies.append(epoch_accuracy)

      #plot_train_val_losses(train_losses, val_losses)

      val_accuracies.append(val_accuracy)

In [118]:
#model.load_state_dict(torch.load('best_model_state.bin'))

FileNotFoundError: ignored