In [119]:
import ast
import codecs
import csv
import itertools
import os
import random
import re
import unicodedata

import torch
import torch.nn as nn #Neural Networks package
import torch.nn.functional as F
from torch import optim #Optimisers

In [120]:
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU
CUDA = torch.cuda.is_available()
if CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Part 1: Preprocessing

In [121]:
# Paths to the two raw corpus files, both relative to the working directory
lines_filepath, conv_filepath = (
    os.path.join('cornell movie-dialogs corpus', filename)
    for filename in ('movie_lines.txt', 'movie_conversations.txt')
)

In [122]:
#Visualise some lines
# Decode with the same encoding the parsing cells use, so the preview shows the
# text they will actually see. iso-8859-1 maps every byte to a character, so no
# input is silently dropped.
with open(lines_filepath, 'r', encoding='iso-8859-1') as file:
    # islice stops after the first 8 lines instead of loading the whole file
    preview_lines = list(itertools.islice(file, 8))
for line in preview_lines:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [123]:
#Parse the lines file into a lookup table: lineID -> record holding every field named in lines_fields
lines_fields = ['lineID', 'characterID', 'movieID', 'character', 'text']
lines = {}
with open(lines_filepath, 'r', encoding='iso-8859-1') as f:
    for raw_line in f:
        values = raw_line.split(' +++$+++ ')
        #Pair each field name with the piece at the same position
        #(the line is not stripped first, so the last piece keeps any trailing newline)
        lineObj = {field: values[i] for i, field in enumerate(lines_fields)}
        #Index the record by its own lineID
        lines[lineObj['lineID']] = lineObj

In [124]:
#Groups the parsed line records (the `lines` dict) into conversations based on "movie_conversations.txt"
conv_fields = ['characterID', 'character2ID', 'movieID', 'utteranceIDs']
conversations = []
with open(conv_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(' +++$+++ ')
        #Extract fields
        convObj = {field: values[i] for i, field in enumerate(conv_fields)}
        #convObj['utteranceIDs'] is the text of a list literal: "['id123', 'id123213', ...]".
        #ast.literal_eval accepts only Python literals, so unlike eval() it cannot
        #execute code that happens to be embedded in the data file.
        lineIds = ast.literal_eval(convObj['utteranceIDs'].strip())
        #Reassemble lines: swap every ID for its full record (KeyError if an ID is unknown)
        convObj['lines'] = [lines[lineId] for lineId in lineIds]
        conversations.append(convObj)

In [125]:
#Build [input, target] pairs from every two consecutive lines of each conversation
qa_pairs = []
for conversation in conversations:
    utterances = conversation['lines']
    #Walk adjacent utterances; the final one has no successor and so starts no pair
    for current, following in zip(utterances, utterances[1:]):
        inputLine = current['text'].strip()
        targetLine = following['text'].strip()
        #Drop the pair when either side is empty after stripping
        if not (inputLine and targetLine):
            continue
        qa_pairs.append([inputLine, targetLine])

In [126]:
#Define path to new file
datafile = os.path.join('cornell movie-dialogs corpus', 'formatted_movie_lines.txt')
delimiter = '\t'
#Unescape the delimiter (a literal tab passes through unchanged; this only
#matters when the delimiter is supplied in escaped form such as '\\t')
delimiter = str(codecs.decode(delimiter, 'unicode_escape'))

#Write new csv file
print('\nWriting newly formatted file...')
#newline='' leaves line endings to the csv writer; without it, platforms that
#translate '\n' on write would turn the writer's '\r\n' into '\r\r\n'
with open(datafile, "w", encoding="utf-8", newline='') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    #One row per [input, target] pair
    writer.writerows(qa_pairs)
print('Done writing to file')



Writing newly formatted file...
Done writing to file
