In [1]:
from model.utils.load_utils import prepare_data

In [2]:
voc, train_pairs, vector = prepare_data('train', 'glove.42B.300d/glove.42B.300d.txt', small=True)

Reading train -------
Read 45533 sentence pairs
Counting words
Counted words:
In vocabulary: 22970 words


In [None]:
# Print the position of every pair whose source side is exactly one space.
for i, pair in enumerate(train_pairs):
    source_text = pair[0]
    if source_text == ' ':
        print(i)

In [3]:
train_pairs[22645:22651]

[['how long will it take to get the results ?',
  "we will send your doctor the results and he will contact you . eou ' EOS"],
 ["'hello . i need to disconnect my phone please .",
  'all right . where do you live sir ? EOS'],
 ['at 345 lincoln avenue . oklahoma city .',
  'very well . why do you want to disconnect your phone sir ? EOS'],
 ['i m moving to a new home .', 'o . k . may i have your name please ? EOS'],
 ['john smith .',
  'thank you . mr . smith . what s your telephone number ? EOS'],
 ['555 7658', 'thank you . where should i send your final phone bill ? EOS']]

In [None]:
import re
import unicodedata
import numpy as np


class Voc:
    """Vocabulary seeded from an existing ``word -> index`` mapping.

    Attributes:
        name: Label given to this vocabulary.
        word2index: Mapping from word to integer index.
        index2word: Reverse mapping from integer index to word.
        word_count: Mapping from word to occurrence count; every seed word
            starts at 1.
        n_words: Number of distinct words currently stored.
    """

    def __init__(self, name, word2index):
        """Build the vocabulary from ``word2index``.

        Args:
            name: Label for the vocabulary.
            word2index: Mapping from word to index used as the seed. It is
                copied, so later ``add_word`` calls do not modify the
                caller's dictionary.
        """
        self.name = name
        # Copy the seed mapping: add_word() inserts new keys, and the caller's
        # dictionary must not change behind its back.
        self.word2index = dict(word2index)
        # Every seed word starts with a count of 1.
        self.word_count = {word: 1 for word in self.word2index}
        self.n_words = len(self.word2index)
        # Reverse lookup: index -> word.
        self.index2word = {index: word for word, index in self.word2index.items()}

    def add_word(self, word):
        """Register one occurrence of ``word``.

        An unseen word is stored with index ``n_words`` and a count of 1;
        a known word only has its count incremented.
        """
        if word not in self.word2index:
            # NOTE(review): using n_words as the next index assumes the seed
            # indices are exactly 0..len(seed)-1; otherwise a new word can
            # receive an index that is already taken.
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word_count[word] = 1
            self.n_words += 1
        else:
            self.word_count[word] += 1

    def add_sentence(self, sentence):
        """Add every whitespace-separated token of ``sentence`` via ``add_word``."""
        for word in sentence.split():
            self.add_word(word)



def load_glove(file_path, small=True, max_index=10000):
    """Load GloVe-style text embeddings and build a word index.

    Each non-blank line is split on whitespace; the first field is the word
    (lower-cased) and the remaining fields are parsed as floats.

    Args:
        file_path: Path to the embedding text file (read as UTF-8).
        small: When true, stop reading as soon as the next index to assign
            would exceed ``max_index``.
        max_index: Highest index handed out when ``small`` is true.

    Returns:
        A ``(vectors, word2idx)`` tuple.
        ``vectors`` maps each lower-cased word to its float array and the
        integer keys 0-3 to randomly drawn vectors of the same length.
        ``word2idx`` maps each lower-cased word to an index starting at 4 and
        the tokens 'PAD', 'SOS', 'EOS', 'UNK' to 0, 1, 2, 3.

    Raises:
        ValueError: If the file contains no embedding line.
    """
    # Indices 0-3 are reserved for the special tokens registered below.
    idx = 4
    vectors = {}
    word2idx = {}
    embed_dim = None
    with open(file_path, encoding='utf8') as lines:
        for line in lines:
            # Stop early once the index budget is used up.
            if small and idx > max_index:
                break
            fields = line.split()
            # A blank line has no word to index.
            if not fields:
                continue
            word = fields[0].lower()
            # Two entries can collide after lower-casing. Keep the first one
            # and skip the rest so that no index is assigned and then lost,
            # which would leave gaps in the index range.
            if word in word2idx:
                continue
            word2idx[word] = idx
            vectors[word] = np.array(fields[1:], dtype='float')
            if embed_dim is None:
                embed_dim = len(fields) - 1
            idx += 1
    if embed_dim is None:
        raise ValueError(f'No embeddings found in {file_path!r}')
    # Randomly drawn vectors stored under the integer keys 0-3.
    for special_idx in range(4):
        vectors[special_idx] = np.random.normal(scale=0.6, size=(embed_dim, ))
    word2idx['PAD'] = 0
    word2idx['SOS'] = 1
    word2idx['EOS'] = 2
    word2idx['UNK'] = 3
    return vectors, word2idx




# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def UnicodeToASCII(s):
    """Return ``s`` in NFD form with every character of category 'Mn' removed.

    All other characters are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)


def sentence_cleaning(sentence):
    """Normalise one utterance for the vocabulary.

    Steps, in order:
      1. Run the text through ``UnicodeToASCII``, lower-case it and strip
         surrounding whitespace.
      2. Where a run of ``.``, ``!`` or ``?`` is immediately followed by an
         identical run, keep a single copy of that run.
      3. Replace every run of characters other than letters, ``.``, ``!``,
         ``?`` and ``'`` with a single space.

    Returns:
        The cleaned string. It is not stripped again after step 3.
    """
    normalized = UnicodeToASCII(sentence).lower().strip()
    deduplicated = re.sub(r"([.!?]+)\1", r"\1", normalized)
    return re.sub(r"[^a-zA-Z.!?']+", r" ", deduplicated)


def load_file(name):
    """Read ``data/{name}/dialogues_{name}.txt`` and return its lines.

    The whole file is read as UTF-8, surrounding whitespace is stripped, and
    the text is split on newlines.

    Args:
        name: Dataset name used to build both the folder and the file name.

    Returns:
        A list with one string per line of the stripped file content.
    """
    # The context manager closes the handle even when reading fails.
    with open(f'data/{name}/dialogues_{name}.txt', encoding='utf-8') as handle:
        return handle.read().strip().split('\n')


def Read_data(dataset, glove_file_path, small):
    """Load a dialogue dataset and turn it into question/answer pairs.

    Each line returned by ``load_file`` is one dialogue whose utterances are
    separated by ``__eou__``. Within a dialogue, utterances at even positions
    (0, 2, ...) are questions and the utterance that follows each one is its
    answer. Pairs are built dialogue by dialogue, so an irregular dialogue
    cannot shift the pairing of the dialogues after it.

    Args:
        dataset: Dataset name passed to ``load_file``.
        glove_file_path: Path passed to ``load_glove``.
        small: Flag passed to ``load_glove``.

    Returns:
        A ``(vocabulary, pairs, glove_vectors)`` tuple: a ``Voc`` seeded with
        the GloVe word index, a list of ``[question, answer]`` lists of
        cleaned text, and the vectors returned by ``load_glove``.
    """
    print(f'Reading {dataset} -------')
    # One entry per dialogue.
    lines = load_file(dataset)

    pairs = []
    for line in lines:
        utterances = line.split('__eou__')
        # Whatever follows the last separator is not an utterance when it is
        # empty or whitespace only, so drop it.
        while utterances and not utterances[-1].strip():
            utterances.pop()
        # Split by position. Looking up each utterance with list.index()
        # would misplace any utterance that is repeated within the dialogue.
        questions = utterances[0::2]
        answers = utterances[1::2]
        # A dialogue with an odd number of utterances ends on a question that
        # has no answer; give it an empty one so it is padded below.
        if len(answers) < len(questions):
            answers.append('')
        for question, answer in zip(questions, answers):
            cleaned_answer = sentence_cleaning(answer)
            # Pad answers that are empty or blank after cleaning.
            if not cleaned_answer.strip():
                cleaned_answer = 'EMPTY'
            pairs.append([sentence_cleaning(question), cleaned_answer])

    # Load GloVe vectors
    glove_vectors, glove_word2idx = load_glove(glove_file_path, small)
    # Seed the vocabulary with the GloVe word index
    vocabulary = Voc('vocabulary', glove_word2idx)

    return vocabulary, pairs, glove_vectors


def prepare_data(dataset, glove_file_path, small=True):
    """Read a dataset, mark the end of each answer and count its words.

    Args:
        dataset: Dataset name forwarded to ``Read_data``.
        glove_file_path: Embedding file path forwarded to ``Read_data``.
        small: Flag forwarded to ``Read_data``.

    Returns:
        A ``(voc, pairs, word_vector)`` tuple: the vocabulary after every
        pair has been added to it, the pairs with ``' EOS'`` appended to each
        answer, and the vectors returned by ``Read_data``.
    """
    voc, raw_pairs, word_vector = Read_data(dataset, glove_file_path, small)

    # Append the end-of-sentence token to every answer.
    pairs = []
    for raw_pair in raw_pairs:
        pairs.append([raw_pair[0], raw_pair[1] + ' EOS'])

    print(f'Read {len(pairs)} sentence pairs')
    print('Counting words')
    # Question first, then answer, for every pair.
    for pair in pairs:
        for text in pair:
            voc.add_sentence(text)

    print('Counted words:')
    print(f'In {voc.name}: {voc.n_words} words')

    return voc, pairs, word_vector

In [None]:
lines = load_file('train')
# Split each dialogue line into its utterances (list of lists)
list_sentences = [line.split('__eou__') for line in lines]
# Utterances at even positions are the source (question) and those at odd
# positions the target (answer). Slicing by position is used because
# list.index() returns the first match, which misplaces repeated utterances.
source_sentences_list = [sentence[0::2] for sentence in list_sentences]
target_sentences_list = [sentence[1::2] for sentence in list_sentences]


In [None]:
# Remove the first '' and the first ' ' from each source list, independently
# of each other. With both removals inside one try block, a missing ''
# skipped the ' ' removal, and the bare except hid any other error.
for sentence_list in source_sentences_list:
    for blank in ('', ' '):
        if blank in sentence_list:
            sentence_list.remove(blank)

In [None]:
# Print a dialogue's target list once for every target that is exactly ' '.
for target_list in target_sentences_list:
    for utterance in target_list:
        if utterance == ' ':
            print(target_list)

In [None]:
target_sentences_list

In [None]:
source_sentences = [sentence for row in source_sentences_list for sentence in row]

In [None]:
source_sentences[6]

In [None]:
target_sentences = [sentence for row in target_sentences_list for sentence in row]

In [None]:
target_sentences[6]

In [None]:
x = zip(source_sentences, target_sentences)

In [None]:
print(tuple(x))

In [None]:
[[question, answer] for question, answer in zip(source_sentences, target_sentences)]

In [None]:
lines = load_file('train')

In [None]:
# Split every line on newlines, giving one list per line.
list_sentences = [line.split('\n') for line in lines]

In [None]:
# Rebuild each line's text from its pieces and split it into utterances.
# Splitting str(list) instead works on the list's repr, which leaves quote
# characters and escape sequences inside the first and last utterances.
# NOTE(review): the text after the last ' __eou__ ' separator is kept as is,
# so a closing '__eou__' marker, if present, stays on the final utterance.
list_sentences = [
    '\n'.join(line).split(' __eou__ ')
    for line in list_sentences
]

In [None]:
#list_sentences = [[sentence for sentence in line.split(' __eou__')] for line in lines]

In [None]:
# Give every dialogue an even number of utterances by appending 'EMPTY'
# to the ones with an odd count.
for sentence in list_sentences:
    if len(sentence) % 2:
        sentence.append('EMPTY')

In [None]:
len(list_sentences[1060])

In [None]:
list_sentences[1060]

In [None]:
# Even positions are sources, odd positions are targets. Slicing by position
# is used because list.index() returns the first match, which misplaces any
# utterance repeated within a dialogue.
source_sentences_list = [sentence[0::2] for sentence in list_sentences]
target_sentences_list = [sentence[1::2] for sentence in list_sentences]

In [None]:
# For every dialogue, collect the utterances at even positions (0, 2, ...).
source_sentences_list = [
    [utterances[position] for position in range(0, len(utterances), 2)]
    for utterances in list_sentences
]

In [None]:
# For every dialogue, collect the utterances at odd positions (1, 3, ...).
target_sentences_list = [
    [utterances[position] for position in range(1, len(utterances), 2)]
    for utterances in list_sentences
]

In [None]:
source_sentences_list[1060]

In [None]:
target_sentences_list[1060]

In [None]:
source_sentences_list[12]

In [None]:
target_sentences_list[12]

In [None]:
import pandas as pd

In [None]:
data = pd.Series(source_sentences_list)

In [None]:
data.head()

In [None]:
data2 = pd.Series(target_sentences_list)

In [None]:
data3 = pd.concat([data, data2], axis=1)

In [None]:
data3

In [None]:
# Show full cell contents. None is the supported "no limit" value for this
# option; passing -1 is deprecated.
pd.set_option('display.max_colwidth', None)

In [None]:
# Number of utterances on each side of every dialogue.
source_lengths = data3[0].map(len)
target_lengths = data3[1].map(len)
data3['source_len'] = source_lengths
data3['target_len'] = target_lengths

In [None]:
data3.iloc[11116, :]

In [None]:
data3.iloc[12, :]

In [None]:
data3.iloc[3190, :]

In [None]:
data3.iloc[1060, :]

In [None]:
data3[data3['source_len'] != data3['target_len']]

In [None]:
list_sentences[0]

In [None]:
source_sentences_list

In [None]:
source_sentences = [sentence for row in source_sentences_list for sentence in row]
source_sentences

In [None]:
# Targets are the utterances at odd positions. Slicing by position is used
# because list.index() returns the first match, which misplaces any
# utterance repeated within a dialogue.
target_sentences_list = [sentence[1::2] for sentence in list_sentences]
target_sentences_list

In [None]:
target_sentences = [sentence for row in target_sentences_list for sentence in row]

In [None]:
[[question, answer] for question, answer in zip(source_sentences, target_sentences)]