In [1]:
# https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [33]:
# importing the libraries
import pandas as pd
import tensorflow as tf
import re
import time # calculating training time

In [34]:
# Read both corpus files fully into memory as lists of raw rows (split on '\n').
# Context managers close the file handles as soon as the read finishes instead
# of leaving them open for the lifetime of the kernel. Bytes that cannot be
# decoded as UTF-8 are dropped (errors="ignore").
with open("movie_lines.txt", encoding="utf-8", errors="ignore") as lines_file:
    lines = lines_file.read().split('\n')
with open("movie_conversations.txt", encoding="utf-8", errors="ignore") as conversations_file:
    conversations = conversations_file.read().split('\n')

In [35]:
# Lookup table from a line's id to the text of that line (filled in below).
id2line = dict()

In [43]:
# A usable row has exactly five ' +++$+++ '-separated fields: the first is the
# line id and the last is the line's text. Rows with any other field count
# (e.g. a blank trailing row) are ignored.
for raw_row in lines:
    fields = raw_row.split(' +++$+++ ')
    if len(fields) != 5:
        continue
    line_id, line_text = fields[0], fields[4]
    id2line[line_id] = line_text

In [57]:
# Build the list of conversations; each entry is the list of line ids that make
# up one conversation, in the order they appear in the row.
conversations_ids = []

# Skip blank rows wherever they occur instead of blindly dropping the last row:
# slicing with [:-1] silently discarded a real conversation whenever the file
# did not end with a newline.
for conversation in conversations:
    if not conversation.strip():
        continue
    # The last ' +++$+++ ' field is expected to be a bracketed, quoted id list.
    # [1:-1] removes the enclosing brackets; the replaces remove the quotes and
    # spaces, leaving a bare comma-separated list of ids.
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(","))
    
    

In [64]:
# Turn every pair of consecutive lines in a conversation into one training
# example: the earlier line is the question, the line right after it is the
# answer. A conversation of k lines therefore yields k - 1 pairs.
questions = []
answers = []
for line_ids in conversations_ids:
    for position in range(1, len(line_ids)):
        questions.append(id2line[line_ids[position - 1]])
        answers.append(id2line[line_ids[position]])

In [65]:
#Let's clean the data by regular expression

def clean_text(text):
    """Normalise a raw utterance before vocabulary building.

    The text is lower-cased, a fixed set of English contractions is expanded,
    and the special characters listed in the final character class are removed.

    Args:
        text: The raw utterance as a string.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.lower()
    # Every pattern below is lower-case because the text has just been
    # lower-cased; an upper-case pattern such as "I'm" could never match.
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"i'd", "i would", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"can't", "can not", text)
    # "won't" is the contraction of "will not", not "would not".
    text = re.sub(r"won't", "will not", text)
    # Strip special characters. '[' and ']' must be escaped inside the class
    # (an unescaped ']' ends the class early), and '-' is escaped so it is a
    # literal hyphen rather than a range operator.
    text = re.sub(r'[~`\[\]{}@#$%^&*()_\-+=?/.,><:;|"]', "", text)
    return text
    

In [66]:
# Run clean_text over every question and every answer, keeping the original
# order so that clean_questions[i] still pairs with clean_answers[i].
clean_questions = [clean_text(raw_question) for raw_question in questions]
clean_answers = [clean_text(raw_answer) for raw_answer in answers]


In [72]:
# Count how often each word occurs across the cleaned questions and answers.
word2count = {}  # word -> number of occurrences
for question in clean_questions:
    for word in question.split():
        word2count[word] = word2count.get(word, 0) + 1

for answer in clean_answers:
    # Tokenise the answer itself. Splitting `question` here would only re-count
    # the final question once per answer and never look at any answer's words.
    for word in answer.split():
        word2count[word] = word2count.get(word, 0) + 1


In [82]:
# Build two word -> integer vocabularies (one for questions, one for answers).
# Only words seen at least `threshold` times get an id; ids are handed out
# consecutively from 0 in word2count's iteration order.

threshold = 20  # minimum number of occurrences required for a word to get an id

frequent_words = [word for word, count in word2count.items() if count >= threshold]

questionswords2int = {word: index for index, word in enumerate(frequent_words)}
answerwords2int = {word: index for index, word in enumerate(frequent_words)}


In [93]:







# Add the special seq2seq tokens to both vocabularies.
#   <SOS> -> start of the sentence
#   <EOS> -> end of the sentence
#   <OUT> -> stands in for words that were filtered out of the vocabulary
#   <PAD> -> presumably padding for short sequences (usage not shown here)
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']  # order is important

for vocabulary in (questionswords2int, answerwords2int):
    for token in tokens:
        # Word ids run from 0 to len - 1, so the next free id is len(vocabulary).
        # Using len + 1 left one id unused and made the largest id equal to the
        # vocabulary size. Tokens already present are skipped so their ids stay
        # stable if this cell is executed again.
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)


In [88]:
# Reverse lookup for the answer vocabulary: integer id -> word.
answerint2words = dict(zip(answerwords2int.values(), answerwords2int.keys()))

In [94]:
# Terminate every answer with the <EOS> token.
# - Visit the whole list: range(len(clean_answers) - 1) skipped the last answer.
# - Put a space before the token so that str.split() later yields '<EOS>' as a
#   word of its own instead of gluing it onto the answer's final word.
# - Leave answers that already end with the token alone, so executing this cell
#   again does not append it a second time.
for i, answer in enumerate(clean_answers):
    if not answer.endswith(' <EOS>'):
        clean_answers[i] = answer + ' <EOS>'
    

In [101]:
# Translate each question and answer into a list of integer ids.
# Words that were filtered out of the vocabulary are replaced by the <OUT> id.

question_out_id = questionswords2int['<OUT>']
question_into_int = []
for question in clean_questions:
    question_into_int.append(
        [questionswords2int.get(word, question_out_id) for word in question.split()]
    )

answer_out_id = answerwords2int['<OUT>']
answers_into_int = []
for answer in clean_answers:
    # Encode the answer's own words. Splitting `question` here would encode the
    # final question over and over, making every entry identical.
    answers_into_int.append(
        [answerwords2int.get(word, answer_out_id) for word in answer.split()]
    )

In [105]:
# Order the question/answer pairs by question length to speed up training.
# Only questions of 1 to 25 words are kept; within one length the original
# order is preserved, and each answer stays aligned with its question.
sorted_clean_questions = []
sorted_clean_answers = []
for target_length in range(1, 26):
    for index, encoded_question in enumerate(question_into_int):
        if len(encoded_question) != target_length:
            continue
        sorted_clean_questions.append(encoded_question)
        sorted_clean_answers.append(answers_into_int[index])
        
        
    





