In [3]:
import os
import torch
import torch.nn as nn
import numpy as np
from pandas import *
import torch.nn.functional as F
import dictionary_corpus
from torch.autograd import Variable
from model import RNNModel
from collections import defaultdict

In [4]:
# Reproducibility settings: seed both random number generators used in this
# notebook and ask cuDNN for deterministic behavior (no autotuned kernels).
np.random.seed(1111)
torch.manual_seed(1111)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:

# Label of the language model being evaluated.
model_name = "Gulordava"
# Checkpoint of the trained model; the file name records its hyperparameters.
fn = (
    "../../../data/lm/English/"
    "hidden650_batch128_dropout0.2_lr20.0.pt"
)

In [4]:
# Load the saved checkpoint onto the CPU so the notebook also runs without a GPU.
# The opened file handle is what gets passed to torch.load, so the file is read
# once and closed by the `with` block.
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints obtained from a trusted source.
with open(fn, "rb") as model_f:
    model_ = torch.load(model_f, map_location=torch.device('cpu'))



In [5]:
# Build a fresh RNNModel and copy the trained weights of the loaded checkpoint into it.
# NOTE(review): the positional arguments are presumably (rnn type, vocabulary size,
# embedding size, hidden size, number of layers, dropout, tie weights) -- confirm
# against the signature of model.RNNModel.
model = RNNModel("LSTM", 50001, 650, 650, 2, 0.2, False)
# presumably nn.Module.eval(): switches off training-only behavior such as dropout -- TODO confirm
model.eval()
# Keep this as the last expression of the cell: its return value is the cell output.
model.load_state_dict(model_.state_dict())

<All keys matched successfully>

In [6]:
# Path to the directory with the data files of the English language model;
# it is passed to dictionary_corpus.Dictionary to build the vocabulary.
data_path = "../../../data/lm/English"

In [7]:
# Vocabulary object constructed from data_path. The functions in this notebook
# use its `word2idx` mapping to turn words into vocabulary indices.
dictionary = dictionary_corpus.Dictionary(data_path)

In [8]:
def check_vocab(word_list):
    """
    Report which words are missing from the model's vocabulary.

    word_list: iterable with words
    Prints the set of words that have no entry in dictionary.word2idx,
    followed by the size of that set. Returns nothing.
    """
    vocabulary = dictionary.word2idx
    unknown = {word for word in word_list if word not in vocabulary}
    print(unknown)
    print(len(unknown), "words are not in the model's vocabulary")

In [9]:
def check_csv(filename):
    """
    Check every word of a ';'-separated csv file against the model's vocabulary.

    filename: path to a csv file; each cell holds whitespace-separated words
    All columns except "item" are split into words and handed to check_vocab,
    which prints the words that are not in the vocabulary.
    """
    data = read_csv(filename, delimiter = ';')
    words = []
    # DataFrame.iteritems() was removed in pandas 2.0; items() works in old and new versions.
    for colname, colval in data.items():
        if colname == "item":
            continue
        # Empty cells are read as NaN and have no words; str() keeps
        # numeric cells from crashing on .split().
        for cell in colval.dropna():
            words.extend(str(cell).split())
    check_vocab(words)

The following function assigns a surprisal value to each word in a sentence, given the preceding words of that sentence as context:

In [10]:
def sent_surprisal(prompt):
    """
    Compute the surprisal of every word in a sentence given the preceding words.

    prompt: list with words (including punctuation and the final <eos>)
    return: list with one surprisal value per word of `prompt`; the surprisal
            is the negative natural-log probability that the model assigns to
            the word. The first and the last position are not scored and are
            reported as 0.
    """
    np.random.seed(1111)
    n_words = len(prompt)
    # Nothing can be scored for an empty or one-word prompt; still return
    # exactly one value per word so the result lines up with the input.
    if n_words < 2:
        return [0] * n_words
    # Words that are not in the vocabulary are mapped to <unk>.
    word_ids = [dictionary.word2idx[w] if w in dictionary.word2idx
                else dictionary.word2idx["<unk>"]
                for w in prompt]
    indices = torch.tensor(word_ids, dtype=torch.long)
    surprisal_arr = [0]  # surprisal for initial position already added
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output, _ = model(indices.view(-1, 1),  # Remember, (sequence_length, batch_size)
                          model.init_hidden(1))  # one input at a time, thus batch_size = 1
        # output[position] holds the scores for the word at position + 1;
        # the first and last words are excluded from scoring.
        for position in range(n_words - 2):
            current_word_scores = output[position].view(-1)
            current_word_log_probs = F.log_softmax(current_word_scores, dim=0)
            # Look up the target through the same <unk>-mapped ids as the input,
            # so an out-of-vocabulary word does not raise a KeyError here.
            next_word_log_prob = current_word_log_probs[word_ids[position + 1]]
            # .item() converts the one-element tensor to a Python float
            surprisal_arr.append(-next_word_log_prob.item())
    surprisal_arr.append(0)  # surprisal for <eos> given punctuation
    return surprisal_arr

In [11]:
def get_surprisal_values(data):
    """
    Compute surprisal values for all sentences stored in a data frame.

    data: data frame with a 'word' column holding one word per row, where each
          sentence ends with an '<eos>' row
    return: flat list with the surprisal values of all sentences, in row order.
            Rows after the last '<eos>' do not belong to a complete sentence
            and get no value.
    """
    # Work on row positions rather than index labels, so the result does not
    # depend on what the index of `data` looks like.
    words = data['word'].to_list()
    surprisal_values = []
    sent_start = 0
    for position, word in enumerate(words):
        if word == '<eos>':
            # sentence = everything since the previous <eos>, including this <eos>
            surprisal_values.extend(sent_surprisal(words[sent_start:position + 1]))
            sent_start = position + 1
    return surprisal_values

**Function for the analysis of distance and syntactic position:**

In [12]:
def filename_from_dataset(dataset, model_name):
    """
    Derive the path of the result file from the path of a dataset file.

    dataset: path to the dataset; its first 17 characters (the length of
             '../test_sentences') and its last 4 characters (the length of
             '.csv') are cut off
    model_name: currently unused, because the notebook works with one model
    return: '../results' + the remaining part of the path + '_result.csv';
            the path is also printed
    """
    kept_part = dataset[17:-4]
    result_filename = "".join(('../results', kept_part, '_result', '.csv'))
    print(result_filename)
    return result_filename

In [13]:
def analyze_data(dataset, model_name):
    """
    Add surprisal values to a dataset and save the result as a csv file.

    dataset: path to a csv file with a 'word' column (one word per row,
             sentences terminated by '<eos>'); its first column is read as index
    model_name: passed on to filename_from_dataset
    return: the data frame with the added columns 'surprisal', 'dependency'
            and 'language'

    Prints the words that are missing from the vocabulary and the path of the
    result file, and writes the data frame to that path.
    """
    data = read_csv(dataset, index_col=0)
    check_vocab(data['word'].to_list())
    data['surprisal'] = get_surprisal_values(data)
    data["dependency"] = "Wh"
    data["language"] = "English"
    result = filename_from_dataset(dataset, model_name)
    # The result path mirrors the sub-directory of the dataset; create it so
    # that to_csv does not fail on a fresh checkout.
    result_dir = os.path.dirname(result)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    data.to_csv(result, encoding="utf-8-sig", index=False)
    return data

In [14]:
# Run the analysis for the four sentence sets. analyze_data already sets the
# "dependency" and "language" columns on every frame it returns, so they are
# not assigned again here.
eq = analyze_data('../test_sentences/jml_sentences/eq_subj_wh_en.csv', model_name)
whether = analyze_data('../test_sentences/jml_sentences/whether_wh_en.csv', model_name)
subject = analyze_data('../test_sentences/jml_sentences/subject_wh_en.csv', model_name)
unbound = analyze_data('../test_sentences/jml_sentences/unbound_wh_en.csv', model_name)

{'<bos>'}
1 words are not in the model's vocabulary
../results/jml_sentences/eq_subj_wh_en_result.csv
{'<bos>'}
1 words are not in the model's vocabulary
../results/jml_sentences/whether_wh_en_result.csv
{'<bos>'}
1 words are not in the model's vocabulary
../results/jml_sentences/subject_wh_en_result.csv
{'<bos>'}
1 words are not in the model's vocabulary
../results/jml_sentences/unbound_wh_en_result.csv


In [15]:
# Write an additional copy of every result frame directly into ../results.
for result_frame, result_path in (
    (unbound, '../results/unbound_result_en_jml.csv'),
    (whether, '../results/whether_result_en_jml.csv'),
    (subject, '../results/subject_result_en_jml.csv'),
    (eq, '../results/eq_subj_result_en_jml.csv'),
):
    result_frame.to_csv(result_path, encoding="utf-8-sig", index=False)