In [1]:
import numpy as np
import json
import unicodedata
import re

from numpy import savez_compressed
import pickle

import tensorflow as tf
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
def unicode_to_ascii(s):
    '''
    parameters : s - string
    Decomposes the string with Unicode NFD normalization and drops every
    character whose category is 'Mn' (non-spacing mark), i.e. accents are
    removed from accented letters. Characters that have no such decomposition
    are left untouched, so the result is not guaranteed to be pure ASCII.
    returns : the string without combining marks
    '''
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_text(sentence, is_summary) :
    '''
    parameters : sentence - string
                 is_summary - truthy when the text is a summary, falsy when
                              it is a meeting utterance
    Lower-cases and strips the text, removes accents via unicode_to_ascii,
    then applies three regex clean-up passes and prepends a start tag:
    '[BEGIN] ' for summaries, '[BOS] ' for everything else.
    The result may contain a trailing space (e.g. after a final '.').
    returns : de-noised/pre-processed sentence
    '''

    cleanup_passes = (
        # put a space on both sides of ? . ! , ¿
        # eg: "he is a boy." => "he is a boy . "
        (r"([?.!,¿])", r" \1 "),
        # collapse every run of double quotes and/or spaces into one space
        (r'[" "]+', " "),
        # any run of characters other than a-z, A-Z, ? . ! , ¿ ' becomes one space
        (r"[^a-zA-Z?.!,¿']+", " "),
    )

    text = unicode_to_ascii(sentence.lower().strip())
    for pattern, replacement in cleanup_passes :
        text = re.sub(pattern, replacement, text)

    start_tag = '[BEGIN] ' if is_summary else '[BOS] '
    return start_tag + text

In [4]:
# Read the JSON-lines dump: one JSON document per line.
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's default encoding.
with open('./split_0.jsonl', 'r', encoding='utf-8') as json_file:
    json_list = list(json_file)

# results   : list of dictionaries, one per meeting. This cell reads the keys
#             'meeting' (a list of turns, each with 'speaker' and 'utt') and
#             'summary' (a list of strings).
# turns     : 2d list, turns[i][j] is the speaker id of turn j in meeting i
#             ('A' -> 0, 'B' -> 1, ...)
# meetings  : 2d list, meetings[i][j] is the pre-processed sentence of that turn
# summaries : 1d list, summaries[i] is the pre-processed summary of meeting i

NUM_TURNS = 70  # only the first NUM_TURNS turns of every meeting are kept

# Blank lines (e.g. a trailing newline at the end of the file) are skipped;
# json.loads would raise on them.
results = [json.loads(json_str) for json_str in json_list if json_str.strip()]

turns = []
meetings = []
summaries = []

for result in results :
    turns_temp = []
    meetings_temp = []
    for obj in result['meeting'][:NUM_TURNS] :
        # convert letter to number i.e 'A' -> 0, 'B' -> 1
        turns_temp.append(ord(obj['speaker']) - ord('A'))
        # obj['utt']['word'] is a list of tokens; rebuild the utterance text
        sentence = ' '.join(obj['utt']['word'])
        meetings_temp.append(preprocess_text(sentence, is_summary=False))

    turns.append(turns_temp)
    meetings.append(meetings_temp)

    # the summary is stored as a list of strings; join them into one text
    summary = ' '.join(result['summary'])
    summaries.append(preprocess_text(summary, is_summary=True))

In [5]:
# Flat list of every pre-processed utterance followed by every summary;
# this is the corpus the tokenizer vocabulary is built from.
all_text = [utterance for meeting in meetings for utterance in meeting]
all_text.extend(summaries)

In [6]:
def make_tokenizer(sentences) :
    '''
    parameters : sentences - list of sentences
    Builds a Keras Tokenizer with an empty filter string and '[UNKNOWN]' as
    the out-of-vocabulary token, then fits its vocabulary on the sentences.
    returns : the fitted Tokenizer object
    '''

    fitted = Tokenizer(filters='', oov_token='[UNKNOWN]')
    fitted.fit_on_texts(sentences)
    return fitted

def encode_texts(sentences, tokenizer) :
    '''
    parameters : sentences - list of sentences
                 tokenizer - object exposing texts_to_sequences (a Tokenizer
                             fitted on the dataset)
    Delegates to tokenizer.texts_to_sequences, which replaces every word of
    every sentence with its index in the tokenizer's vocabulary.
    returns : list of encoded sentences, one list of indices per sentence
    '''

    return tokenizer.texts_to_sequences(sentences)

In [7]:
tokenizer = make_tokenizer(all_text)

# Every utterance and every summary is padded / cut to this many tokens.
MAX_SEQ_LEN = 100

# pad_sequences truncates from the FRONT by default (truncating='pre'), which
# would cut the leading '[BOS]' / '[BEGIN]' tag off any text longer than
# MAX_SEQ_LEN. Truncate at the end instead so the start tag always survives.
padded_meetings = [
    pad_sequences(encode_texts(meeting, tokenizer), maxlen=MAX_SEQ_LEN,
                  padding='post', truncating='post')
    for meeting in meetings
]

# Speaker ids per meeting, right-padded to the longest meeting.
# NOTE(review): the pad value is 0, which is also the id given to speaker 'A'
# when `turns` is built - padded positions are indistinguishable from 'A'.
padded_turns = pad_sequences(turns, padding='post')

padded_summaries = pad_sequences(encode_texts(summaries, tokenizer), maxlen=MAX_SEQ_LEN,
                                 padding='post', truncating='post')

In [8]:
# Minimum number of bits used to encode a speaker id.
MAX_LENGTH_BIN = 3

# Largest speaker id over all meetings. Computed meeting by meeting so that
# meetings with different numbers of turns (a ragged `turns`) are handled;
# np.max on a ragged list of lists fails. Empty meetings are ignored.
max_turn_number = max(
    (int(max(meeting_turns)) for meeting_turns in turns if len(meeting_turns)),
    default=0,
)

# Use more than MAX_LENGTH_BIN bits when the largest id needs them, so that
# every row of role_vector has the same length (ids >= 8 need 4+ bits).
num_bits = max(MAX_LENGTH_BIN, max_turn_number.bit_length())

# role_vector[k] is the zero-padded binary representation of speaker id k,
# most significant bit first, e.g. 2 -> [0, 1, 0] with 3 bits.
role_vector = [
    np.array([int(bit) for bit in format(num, 'b').zfill(num_bits)])
    for num in range(max_turn_number + 1)
]

In [9]:
# Write each array to its own compressed .npz archive as int64. The array is
# passed positionally and is read back later under the key 'arr_0'.
_exports = (
    ('meetings.npz', padded_meetings),     # (num_meetings, num_turns, seq_len)
    ('turns.npz', padded_turns),           # (num_meetings, num_turns)
    ('summary.npz', padded_summaries),     # (num_meetings, summary_len)
    ('role_vector.npz', role_vector),      # (num_roles, 3)
)
for _filename, _values in _exports:
    savez_compressed(_filename, np.array(_values).astype(np.int64))

In [10]:
# Serialize the fitted tokenizer next to the arrays so the same vocabulary
# can be restored later.
with open('tokenizer.pickle', 'wb') as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
#data
# np.load on an .npz returns an archive object holding an open file handle;
# the `with` blocks close it once the array has been read into memory.
# NOTE(review): `meetings` and `turns` are rebound here from Python lists to
# numpy arrays, shadowing the lists built earlier in the notebook.
with np.load('meetings.npz') as archive:
    meetings = archive['arr_0'] # (num_meetings, num_turns, seq_len)
with np.load('turns.npz') as archive:
    turns = archive['arr_0'] # (num_meetings, num_turns)
with np.load('summary.npz') as archive:
    summary = archive['arr_0'] # (num_meetings, summary_len)
with np.load('role_vector.npz') as archive:
    role_vector = archive['arr_0'] # (num_roles, 3)

# Unpickling can execute arbitrary code: only load a tokenizer.pickle that
# was produced by this notebook or another trusted source.
with open('tokenizer.pickle', 'rb') as file:
    tokenizer = pickle.load(file)

In [12]:
# Slice the three arrays along their first axis into aligned
# (meeting, summary, turns) examples and batch them one at a time.
dataset = tf.data.Dataset.from_tensor_slices((meetings, summary, turns)).batch(1)

In [13]:
# Pull the first batch and display the shapes as a sanity check.
first_batch = next(iter(dataset))
meetings1, summary1, turns1 = first_batch
(meetings1.shape, summary1.shape, turns1.shape, role_vector.shape)

(TensorShape([1, 70, 100]),
 TensorShape([1, 100]),
 TensorShape([1, 70]),
 (4, 3))