In [10]:
import pandas as pd
import re
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import tensorflow as tf
from keras.layers import Input, LSTM, Dense
from keras.models import Model

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
## Remove stage directions which are typically in parenthesis (like this)
def preprocess(text):
    
    ## Remove stage directions which are typically in parenthesis (like this)
    text = re.sub(r'\([^)]*\)', '', str(text))
    
    ## Remove capital letters
    text = text.lower()
    
    ## Remove contractions
    contractions = re.findall("[']", text)
    for i in contractions:
        text = text.replace(i,"")
    
    ## add spaces between last word and punctuation
    puncs = re.findall('[.!?,]', text)
    for i in puncs:
        text = text.replace(i," "+i[0]+" ")
        
    ## remove multiple spaces and replace with just one
    text =  re.sub(' +', ' ', text)
    
    ## remove trailing and leading spaces
    text = text.strip()
    
    return text

In [4]:
df['speaker'] = df['speaker'].apply(preprocess)
df['response'] = df['response'].apply(preprocess)

In [5]:
#shuffle the data
df = df.reindex(np.random.RandomState(seed=42).permutation(df.index))
df.head()

Unnamed: 0,speaker,response
6216,"hey guys , guess who i found at lax . my baby ...",excuse me . i object . you propose a guessing ...
1056,what’s that ?,i’m going to run around outside with a wet hea...
9053,"well , maybe it’s all for the best , you know ...",no it doesn’t . not unless the two doors are c...
346,maybe we can go up to napa valley . they’ve go...,"boo , wine ! but yay , trains . i’m in ."
5356,"now that penny and i are engaged , i thought w...","of course . she’s spent many nights here , and..."


In [6]:
## get x and y and convert to numpy
X_data = df['speaker'].to_numpy()
Y_data = df['response'].to_numpy()

## get the indices for the train, dev, and test sets
trainPercent = .8
devPercent = .1
testPercent = .1
trainEnd = (int)(len(X_data) * trainPercent)
devEnd =  trainEnd + (int)(len(X_data) * devPercent)

## get the train sets
X_train = X_data[0: trainEnd]
Y_train = Y_data[0: trainEnd]

## get the dev sets
X_dev = X_data[trainEnd + 1: devEnd]
Y_dev = Y_data[trainEnd + 1: devEnd]

## get the test sets
X_test = X_data[devEnd + 1:]
Y_test = X_data[devEnd + 1:]

## print the sizes
print('Train Size : ' + str(len(X_train)))
print('Dev Size : ' + str(len(X_dev)))
print('Test Size : ' + str(len(X_test)))

Train Size : 7796
Dev Size : 973
Test Size : 974


In [7]:
## create the tokenizer
tokenizer = Tokenizer()

## fit the tokenizer on the text
tokenizer.fit_on_texts(list(X_train) + list(Y_train))

## Unique tokens in the text
uniqTokens = len(tokenizer.get_config()['word_counts'])

## tokenize the train, dev, and test
X_train_tokens = tokenizer.texts_to_sequences(X_train)
Y_train_tokens = tokenizer.texts_to_sequences(Y_train)

X_dev_tokens = tokenizer.texts_to_sequences(X_dev)
Y_dev_tokens = tokenizer.texts_to_sequences(Y_dev)

X_test_tokens = tokenizer.texts_to_sequences(X_test)
Y_test_tokens = tokenizer.texts_to_sequences(Y_test)

## get the largest length of the sentences
maxLenX = max([len(s.split()) for s in X_data])
maxLenY = max([len(s.split()) for s in Y_data])

## padding input to the same size
X_train_pad = pad_sequences(X_train_tokens, maxlen = maxLenX)
X_dev_pad = pad_sequences(X_dev_tokens, maxlen = maxLenX)
X_train_pad = pad_sequences(X_test_tokens, maxlen = maxLenX)

Y_train_pad = pad_sequences(Y_train_tokens, maxlen = maxLenY)
Y_dev_pad = pad_sequences(Y_dev_tokens, maxlen = maxLenY)
Y_train_pad = pad_sequences(Y_test_tokens, maxlen = maxLenY)