In [None]:
import string
import numpy as np
from collections import Counter
from distutils.version import LooseVersion
import warnings


def load_file(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Open the path supplied by the caller (it used to be ignored in favour of
    # a hard-coded name). The ``with`` block closes the handle on exit, so no
    # explicit close() is needed.
    with open(filename, "r") as transcript_file:
        return transcript_file.read()

def clean_doc(file):
    """Normalise the raw transcript text with a fixed chain of substitutions.

    The rules are applied one after another, so each rule sees the output of
    the previous ones (for example commas are stripped before double spaces
    are collapsed).

    Args:
        file: The raw transcript text as a single string.

    Returns:
        The rewritten text as a single string. No tokenisation is performed
        here; callers split the result themselves.
    """
    # Order is significant -- do not sort or merge these rules.
    substitutions = (
        ("\n,", "["),
        ('",,', "\n\n\n\n"),
        (":", "]:"),
        ("[[", "["),
        (",", ""),
        ("  ", " "),
        ("\n[\n", ""),
        ("\t", " "),
        ("'", ""),
        ('"', ""),
    )
    for old, new in substitutions:
        file = file.replace(old, new)
    return file

def punct_replace(file):
    """Spell out punctuation characters as upper-case placeholder words.

    Args:
        file: Text in which newlines and punctuation should be replaced.

    Returns:
        A copy of ``file`` where each listed character has been replaced by
        its placeholder (including the exact surrounding spaces shown below).
    """
    # None of the placeholders contains a character that is itself replaced,
    # so one simultaneous translation equals the sequential replace chain.
    placeholders = str.maketrans({
        "\n": " NEWLINE ",
        "!": " EXCLAMATIONMARK",
        "$": "DOLLARSIGN ",
        ".": " FULLSTOP ",
        "/": " FORWARDSLASH ",
        ":": " COLON",
        "?": " QUESTIONMARK",
        "[": "LEFTSQUAREBRACKET ",
        "]": " RIGHTSQUAREBRACKET",
    })
    return file.translate(placeholders)

def create_lookup_tables(text, min_count=10):
    """Build word<->id lookup tables and the list of rare words.

    Every distinct word ends up in exactly one of the two groups: words seen
    at least ``min_count`` times form the vocabulary, all others are reported
    as ignored. (Previously a word seen exactly 10 times was in neither.)

    Args:
        text: Iterable of word tokens.
        min_count: Minimum number of occurrences for a word to enter the
            vocabulary. Defaults to 10.

    Returns:
        A tuple ``(vocab_to_int, int_to_vocab, sorted_ignored)`` where
        ``vocab_to_int`` maps word -> id, ``int_to_vocab`` maps id -> word
        (id 0 is the most frequent word) and ``sorted_ignored`` is the list
        of rare words ordered by ascending count.
    """
    word_counts = Counter(text)

    frequent = [word for word, count in word_counts.items() if count >= min_count]
    rare = [word for word, count in word_counts.items() if count < min_count]

    # sorted() is stable, so words with equal counts keep first-seen order.
    sorted_vocab = sorted(frequent, key=word_counts.get, reverse=True)
    sorted_ignored = sorted(rare, key=word_counts.get)

    int_to_vocab = dict(enumerate(sorted_vocab))
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
    return vocab_to_int, int_to_vocab, sorted_ignored
    

def sentences(sorted_ignored, all_words, sequence_length=10):
    """Cut the token stream into fixed-length sequences plus their next word.

    A sliding window of ``sequence_length + 1`` tokens is moved over
    ``all_words`` one token at a time. Windows containing any ignored word
    are dropped; for the rest, the first ``sequence_length`` tokens become a
    training sequence and the last token becomes its target.

    Args:
        sorted_ignored: Iterable of words that must not appear in a window.
        all_words: Sequence of tokens supporting slicing (e.g. a list).
        sequence_length: Number of input tokens per sequence. Defaults to 10.

    Returns:
        A tuple ``(sequences, next_words)`` of equal length.
    """
    # Build the lookup set once: intersecting against the raw list rescans
    # every ignored word for every window.
    ignored_words = set(sorted_ignored)

    sequences = []
    next_words = []
    ignored = 0
    for start in range(len(all_words) - sequence_length):
        # The range bound guarantees the window is always full length.
        window = all_words[start:start + sequence_length + 1]
        if ignored_words.isdisjoint(window):
            sequences.append(window[:-1])
            next_words.append(window[-1])
        else:
            ignored += 1
    print('Ignored sequences: ', ignored)
    print('Remaining sequences: ', len(sequences))
    return sequences, next_words


# Load the raw transcript text and run the substitution-based clean-up.
in_filename = 'transcripts.txt'
in_file = load_file(in_filename)
cleaned_file = clean_doc(in_file)

# Replace punctuation by placeholder words, lower-case everything and split
# on whitespace to obtain the flat token list used by the rest of the notebook.
punct_replaced = punct_replace(cleaned_file).lower().split()

# Lookup tables plus the rare words, then the training windows.
vocab_to_int, int_to_vocab, sorted_ignored = create_lookup_tables(punct_replaced)
# NOTE: this rebinds the name `sentences` from the function to its result.
sentences, next_words = sentences(sorted_ignored, punct_replaced)



#corpus = sorted(list(set(clean_file)))
#print(corpus)

#print('Dataset Stats')
#print('Roughly the number of unique words: {}'.format(len({word: None for word in cleaned_file.split()})))
#scenes = cleaned_file.split('\n\n\n\n\n\n')
#print('Number of episodes: {}'.format(len(scenes)))
#sentence_count_scene = [scene.count('\n') for scene in scenes]
#print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

#sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
#print('Number of lines: {}'.format(len(sentences)))
#word_count_sentence = [len(sentence.split()) for sentence in sentences]
#print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

#print()
#print('The sentences {} to {}:'.format(*view_sentence_range))
#print('\n'.join(cleaned_file.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))


In [5]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=20):
    """Shuffle sequences and targets together, then split into train/test.

    Args:
        sentences_original: Indexable collection of input sequences.
        next_original: Indexable collection of targets, aligned by position
            with ``sentences_original``.
        percentage_test: Share of the data (in percent) placed in the test
            split. Defaults to 20.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)`` of lists.
    """
    print('Shuffling sentences')

    # A single permutation drives both lists so the pairs stay aligned.
    order = np.random.permutation(len(sentences_original))
    shuffled_sentences = [sentences_original[pos] for pos in order]
    shuffled_next = [next_original[pos] for pos in order]

    train_fraction = 1.-(percentage_test/100.)
    cut_index = int(len(sentences_original) * train_fraction)

    x_train = shuffled_sentences[:cut_index]
    x_test = shuffled_sentences[cut_index:]
    y_train = shuffled_next[:cut_index]
    y_test = shuffled_next[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return x_train, y_train, x_test, y_test

# Rebinds `sentences` / `next_words` to the training portion only, so
# re-running this cell shuffles and splits the already-reduced training data
# again. Re-run the cell that builds the sequences first if a fresh split is
# needed.
sentences, next_words, sentences_test, next_words_test = shuffle_and_split_training_set(sentences, next_words)

Shuffling sentences
Size of training set = 720758
Size of test set = 180190


In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM, Bidirectional, Dropout
from keras.optimizers import RMSprop
import keras

def get_model(dropout=0.2):
    """Build the bidirectional-LSTM next-word model.

    The input is a window of 10 one-hot encoded words and the output is a
    softmax over the vocabulary, so both widths equal ``len(word_indices)``.

    Args:
        dropout: Dropout rate applied after the recurrent layer; no dropout
            layer is added when the value is not positive.

    Returns:
        The (uncompiled) Keras ``Sequential`` model.
    """
    vocab_size = len(word_indices)

    model = Sequential()
    model.add(Bidirectional(LSTM(128), input_shape=(10, vocab_size)))
    if dropout > 0:
        model.add(Dropout(dropout))

    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    return model


# One one-hot column per *distinct* token. Enumerating the raw token list
# made the encoding as wide as the whole corpus; sorting keeps the ids stable
# between kernel restarts.
# Batches are produced lazily by `generator` below rather than materialised
# as one dense array for the full data set.
word_indices = {word: position for position, word in enumerate(sorted(set(punct_replaced)))}
indicied_word = {position: word for word, position in word_indices.items()}


def generator(sentences, next_words, batch_size):
    """Yield one-hot encoded ``(x, y)`` batches forever.

    Args:
        sentences: List of word sequences (10 words each, matching the model
            input shape).
        next_words: Target word for each sequence, aligned by position.
        batch_size: Number of samples per yielded batch.

    Yields:
        Tuples ``(x, y)`` of boolean arrays shaped
        ``(batch_size, 10, vocab_size)`` and ``(batch_size, vocab_size)``.
        Sampling wraps around to the first sequence when the data is
        exhausted.

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    if len(sentences) == 0:
        raise ValueError("generator() needs at least one sentence")

    vocab_size = len(word_indices)
    index = 0
    while True:
        # Fresh arrays per batch so already-yielded batches are never mutated.
        # The builtin `bool` replaces the removed `np.bool` alias.
        x = np.zeros((batch_size, 10, vocab_size), dtype=bool)
        y = np.zeros((batch_size, vocab_size), dtype=bool)
        for i in range(batch_size):
            for t, w in enumerate(sentences[index]):
                x[i, t, word_indices[w]] = 1
            # Exactly one label per sample; advance once per sample (not once
            # per word) and wrap at the end of the data.
            y[i, word_indices[next_words[index]]] = 1
            index = (index + 1) % len(sentences)
        # Emit only after the whole batch has been filled.
        yield x, y


model = get_model()

Using TensorFlow backend.


In [None]:
# Same batch size for training and validation; the step counts below are
# derived from it so one epoch covers each data set once (plus one extra step).
BATCH_SIZE = 32

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(
    generator(sentences, next_words, BATCH_SIZE),
    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
    epochs=100,
    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1,
)

Epoch 1/100
