In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
import re

%matplotlib inline 

In [2]:
#loading the data
lines = []
with open('cornall/movie_lines.txt') as infile:
    for line in infile:
        lines.append(line)
        
convs = []
with open('cornall/movie_conversations.txt') as infile:
    for line in infile:
        convs.append(line)

print "Lines : ",len(lines)
print "Conversations : ",len(convs)

Lines :  304713
Conversations :  83097


In [3]:
line_ids = []
line_texts = []
for line in lines:
    line = line.split(' +++$+++ ')
    line_ids.append(line[0])
    line_texts.append(line[-1])

print len(line_ids),len(line_texts)
print line_ids[:5],line_texts[:5]

304713 304713
['L1045', 'L1044', 'L985', 'L984', 'L925'] ['They do not!\n', 'They do to!\n', 'I hope so.\n', 'She okay?\n', "Let's go.\n"]


In [4]:
#text processing
def text_processing(line):
        line = line.strip().lower()
        line = re.sub('i\'m','i am',line)
        line = re.sub('don\'t','do not',line)
        line = re.sub('won\'t','would not',line)
        line = re.sub('let\'s','let us',line)
        line = re.sub('he\'s','he is',line)
        line = re.sub('she\'s','she is',line)
        line = re.sub('you\'re','you are',line)
        line = re.sub('\'d',' would',line)
        line = re.sub('\'ve',' have',line)
        line = re.sub('it\'s','it is',line)
        line = re.sub('i\'ll','i will',line)
        line = re.sub('can\'t','can not',line)
        line = re.sub('that\'ll','that will',line)
        line = re.sub('\'nt',' not',line)
        line = re.sub('that\'s','that is',line)
        line = re.sub('what\'s','what is',line)
        line = re.sub('c\'mon','come on',line)
        line = re.sub('we\'ve','we have',line)
        line = re.sub('[^a-z\s]','',line)
        return line
line_text_proc = [text_processing(line) for line in line_texts]
line_text_proc[:10]

['they do not',
 'they do to',
 'i hope so',
 'she okay',
 'let us go',
 'wow',
 'okay  you are gonna need to learn how to lie',
 'no',
 'i am kidding  you know how sometimes you just become this persona  and you do not know how to quit',
 'like my fear of wearing pastels']

In [5]:
#processing conversations
convs_ids = []
for line in convs:
    line = line.strip().split(' +++$+++ ')[-1]
    line = re.sub('[^L0-9]',' ',line).split()
    convs_ids.append(line)
convs_ids[:5]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208']]

In [6]:
#create question and answer data
id2line = {}
for i in range(len(line_ids)):
    id2line[line_ids[i]] = line_text_proc[i]
print len(id2line)
    
questions = []
answers = []
for conv in convs_ids:
    for i in range(len(conv)-1):
        questions.append(id2line[conv[i]])
        answers.append(id2line[conv[i+1]])

print len(questions),len(answers)
for i in range(5):
    print questions[i]
    print answers[i]


304713
221616 221616
can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
well i thought we would start with pronunciation if that is okay with you
well i thought we would start with pronunciation if that is okay with you
not the hacking and gagging and spitting part  please
not the hacking and gagging and spitting part  please
okay then how bout we try out some french cuisine  saturday  night
you are asking me out  that is so cute what is your name again
forget it
no no it is my fault  we didnt have a proper introduction 
cameron


In [7]:
#remove questions and answers which are less than 2 words and greater than 20 words
min_len = 2
max_len = 20

questions_temp = []
answers_temp = []
for i in range(len(questions)):
    if len(questions[i].split()) >= min_len and len(questions[i].split())<= max_len:
        questions_temp.append(questions[i])
        answers_temp.append(answers[i])
        
questions = []
answers = []
for i in range(len(answers_temp)):
    if len(answers_temp[i].split()) >= min_len and len(answers_temp[i].split()) <= max_len:
        questions.append(questions_temp[i])
        answers.append(answers_temp[i])
        
print len(questions),len(answers)

139121 139121


In [8]:
#create a dictionary and take the words with at least 5 occurances
freq_dist = nltk.FreqDist(' '.join(questions).split())
word_list = sorted([(value,key) for key,value in freq_dist.items() if value >= 5],reverse = True)
#map each word to a unique integer
word2int = {}
for value,key in word_list:
    word2int[key] = len(word2int)

#add unique tokens to dictionary
codes = ['<PAD>','<UNK>','<GO>','<EOS>']
for code in codes:
    word2int[code] = len(word2int)
    
int2word = dict(zip(word2int.values(),word2int.keys()))
print len(word2int),len(int2word)

8637 8637


In [9]:
#convert text to integers
questions_int = []
answers_int = []

for i in range(len(questions)):
    temp = []
    for word in questions[i].split():
        if word2int.get(word,-1) == -1:
            temp.append(word2int['<UNK>'])
        else:
            temp.append(word2int[word])
    questions_int.append(temp)
    temp = []
    for word in answers[i].split():
        if word2int.get(word,-1) == -1:
            temp.append(word2int['<UNK>'])
        else:
            temp.append(word2int[word])
    temp.append(word2int['<EOS>'])
    answers_int.append(temp)
for i in range(2):
    print questions_int[i],[int2word[value] for value in questions_int[i]]
    print "----------------------"
    print answers_int[i],[int2word[value] for value in answers_int[i]]
    print "-----------------------"

[42, 1, 121, 20, 30, 332, 31, 8634, 52, 10, 3, 94, 31, 0] ['well', 'i', 'thought', 'we', 'would', 'start', 'with', '<UNK>', 'if', 'that', 'is', 'okay', 'with', 'you']
----------------------
[7, 2, 8634, 15, 8634, 15, 7661, 416, 139, 8636] ['not', 'the', '<UNK>', 'and', '<UNK>', 'and', 'spitting', 'part', 'please', '<EOS>']
-----------------------
[7, 2, 8634, 15, 8634, 15, 7661, 416, 139] ['not', 'the', '<UNK>', 'and', '<UNK>', 'and', 'spitting', 'part', 'please']
----------------------
[94, 95, 35, 717, 20, 250, 46, 78, 1067, 8634, 1509, 149, 8636] ['okay', 'then', 'how', 'bout', 'we', 'try', 'out', 'some', 'french', '<UNK>', 'saturday', 'night', '<EOS>']
-----------------------
