In [1]:
import random
import re
import unicodedata

In [2]:
lines = open('data/test/dialogues_test.txt', encoding='utf-8').read().strip().split('\n')

In [3]:
lines[0]

'Hey man , you wanna buy some weed ? __eou__ Some what ? __eou__ Weed ! You know ? Pot , Ganja , Mary Jane some chronic ! __eou__ Oh , umm , no thanks . __eou__ I also have blow if you prefer to do a few lines . __eou__ No , I am ok , really . __eou__ Come on man ! I even got dope and acid ! Try some ! __eou__ Do you really have all of these drugs ? Where do you get them from ? __eou__ I got my connections ! Just tell me what you want and I ’ ll even give you one ounce for free . __eou__ Sounds good ! Let ’ s see , I want . __eou__ Yeah ? __eou__ I want you to put your hands behind your head ! You are under arrest ! __eou__'

In [4]:
len(lines)

1000

In [5]:
list_sentences = [[sentence for sentence in line.split('__eou__')] for line in lines]

In [6]:
list_sentences[0]

['Hey man , you wanna buy some weed ? ',
 ' Some what ? ',
 ' Weed ! You know ? Pot , Ganja , Mary Jane some chronic ! ',
 ' Oh , umm , no thanks . ',
 ' I also have blow if you prefer to do a few lines . ',
 ' No , I am ok , really . ',
 ' Come on man ! I even got dope and acid ! Try some ! ',
 ' Do you really have all of these drugs ? Where do you get them from ? ',
 ' I got my connections ! Just tell me what you want and I ’ ll even give you one ounce for free . ',
 ' Sounds good ! Let ’ s see , I want . ',
 ' Yeah ? ',
 ' I want you to put your hands behind your head ! You are under arrest ! ',
 '']

In [7]:
source_sentences_list = [[source for source in sentence if sentence.index(source)%2 == 0] for sentence in list_sentences]

In [8]:
target_sentences_list = [[source for source in sentence if sentence.index(source)%2 != 0] for sentence in list_sentences]

In [9]:
source_sentences_list[0]

['Hey man , you wanna buy some weed ? ',
 ' Weed ! You know ? Pot , Ganja , Mary Jane some chronic ! ',
 ' I also have blow if you prefer to do a few lines . ',
 ' Come on man ! I even got dope and acid ! Try some ! ',
 ' I got my connections ! Just tell me what you want and I ’ ll even give you one ounce for free . ',
 ' Yeah ? ',
 '']

In [10]:
target_sentences_list[0]

[' Some what ? ',
 ' Oh , umm , no thanks . ',
 ' No , I am ok , really . ',
 ' Do you really have all of these drugs ? Where do you get them from ? ',
 ' Sounds good ! Let ’ s see , I want . ',
 ' I want you to put your hands behind your head ! You are under arrest ! ']

In [11]:
source_sentences = [sentence for row in source_sentences_list for sentence in row]

In [12]:
target_sentences = [sentence for row in target_sentences_list for sentence in row]

In [13]:
source_sentences[0]

'Hey man , you wanna buy some weed ? '

In [14]:
SOS_token = 'SOS'
EOS_token = 'EOS'

class QuestionAndAnswer:
    
    def __init__(self, name):
        self.name = name
        self.word_count = {}
        self.word2index = {}
        self.n_words = 2
        self.index2word = {0: 'SOS', 1: 'EOS'}
        
    
    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word_count[word] = 1
            self.n_words += 1
        else:
            self.word_count[word] += 1
    
    def add_sentence(self, sentence):
        for word in sentence.split():
            return self.add_word(word)

In [15]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def UnicodeToASCII(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [16]:
UnicodeToASCII(source_sentences[0].lower().strip())

'hey man , you wanna buy some weed ?'

In [17]:
def sentence_cleaning(sentence):
    # Transforms to ASCII, lower case and strip blank spaces
    sentence = UnicodeToASCII(sentence)
    # Get rid of double puntuation
    sentence = re.sub(r"([.!?]+)\1", r"\1", sentence.lower().strip())
    # Get rid of non-letter character
    sentence = re.sub(r"[^a-zA-Z.!?]+", r" ", sentence)
    return sentence

In [18]:
re.sub(r"([.!?]+)\1", r"\1", 'Hey man , you wanna buy some weed ?? ')

'Hey man , you wanna buy some weed ? '

In [19]:
sentence_cleaning(source_sentences[0])

'hey man you wanna buy some weed ?'

In [20]:
def load_file(name):
    lines = open(f'data/{name}/dialogues_{name}.txt', encoding='utf-8').read().strip().split('\n')
    return lines

In [21]:
test_lines = load_file('test')

In [22]:
test_lines[0]

'Hey man , you wanna buy some weed ? __eou__ Some what ? __eou__ Weed ! You know ? Pot , Ganja , Mary Jane some chronic ! __eou__ Oh , umm , no thanks . __eou__ I also have blow if you prefer to do a few lines . __eou__ No , I am ok , really . __eou__ Come on man ! I even got dope and acid ! Try some ! __eou__ Do you really have all of these drugs ? Where do you get them from ? __eou__ I got my connections ! Just tell me what you want and I ’ ll even give you one ounce for free . __eou__ Sounds good ! Let ’ s see , I want . __eou__ Yeah ? __eou__ I want you to put your hands behind your head ! You are under arrest ! __eou__'

In [23]:
def Read_data(dataset):
    
    print(f'Reading {dataset} -------')
    # Load one of the three datasets train, test or validation and return a list of all the lines
    lines = load_file(dataset)
    # Split each line into sentence and create a list of list
    list_sentences = [[sentence for sentence in line.split('__eou__')] for line in lines]
    # Assumes odd sentences being the source aka question and even sentences the target aka answer, still in a list of list format
    source_sentences_list = [[source for source in sentence if sentence.index(source)%2 == 0] for sentence in list_sentences]
    target_sentences_list = [[source for source in sentence if sentence.index(source)%2 != 0] for sentence in list_sentences]
    # Flattens the list to have all the questions in one list
    source_sentences = [sentence for row in source_sentences_list for sentence in row]
    # Flattens the list to have all the answers in one list
    target_sentences = [sentence for row in target_sentences_list for sentence in row]
    # Creates a pair of question-answer as a list of list
    pairs = [[sentence_cleaning(question), sentence_cleaning(answer)] for question, answer in zip(source_sentences, target_sentences)]
    # Initialize the classes questions and answers to assign indexes and count the words
    questions = QuestionAndAnswer('questions')
    answers = QuestionAndAnswer('answers')
    
    return questions, answers, pairs

In [24]:
def prepare_data(dataset):
    q, a, pairs = Read_data(dataset)
    print(f'Read {len(pairs)} sentence pairs')
    print('Counting words')
    for pair in pairs:
        q.add_sentence(pair[0])
        a.add_sentence(pair[1])
    
    print('Counted words:')
    print(f'In {q.name}: {q.n_words} words')
    print(f'In {a.name}: {a.n_words} words')
    
    return q, a, pairs

In [25]:
q, a, pairs = prepare_data('test')

Reading test -------
Read 4041 sentence pairs
Counting words
Counted words:
In questions: 374 words
In answers: 394 words


In [26]:
print(random.choice(pairs))

['what date would you like to fly ?', 'he can spend more time with his grandchildren .']
