In [2]:
import glob, os
import re
import bleach
import random
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Upper-cased speaker label whose lines make_pairs extracts as the reply side of each pair.
TARGET_CHARACTER = "ROSS"
class Line:
    """A single utterance from a script: the speaker label and the spoken text."""

    def __init__(self, speaker, line):
        # speaker: label of whoever is talking; line: the text they say.
        self.speaker, self.line = speaker, line

    def __str__(self):
        """Render the utterance as ``speaker: line``."""
        return ": ".join((self.speaker, self.line))

In [3]:
def process_line(line):
    """Remove parenthesised text from a raw script line and trim it.

    Every ``( ... )`` span is deleted, then leading/trailing whitespace is
    stripped. Stripping happens *after* the removal so that a line made up
    only of parentheticals (e.g. ``"(enters) (sits down)"``) comes back as
    the empty string, and a line that starts or ends with one does not keep
    a stray space.

    Args:
        line: one raw line of script text.

    Returns:
        The cleaned line; ``""`` when nothing but whitespace remains.
    """
    without_parentheticals = re.sub(r'\([^)]*\)', "", line)
    return without_parentheticals.strip()

In [4]:
def _take_sentences(sentences, max_tokens):
    """Return the leading sentences of ``sentences`` that fit in ``max_tokens`` word tokens."""
    kept = []
    used = 0
    for sentence in sentences:
        n_tokens = len(word_tokenize(sentence))
        if used + n_tokens > max_tokens:
            # Stop at the first sentence that would overflow the budget;
            # later (possibly shorter) sentences are intentionally not tried.
            break
        kept.append(sentence)
        used += n_tokens
    return kept


def shorten_pair(pair, max_tokens=30):
    """Truncate a (query, reply) pair of Line objects to a token budget.

    The query keeps its *last* whole sentences (the ones nearest the reply)
    and the reply keeps its *first* whole sentences, each side limited to
    ``max_tokens`` tokens as counted by ``word_tokenize``. Sentences are never
    cut in the middle, so if the first sentence considered already exceeds
    the budget that side becomes the empty string.

    Args:
        pair: ``(query, reply)`` tuple of objects with ``speaker`` and ``line``.
        max_tokens: per-side token budget; defaults to the original limit of 30.

    Returns:
        A ``(Line, Line)`` tuple with the original speakers and shortened text.
    """
    query, reply = pair

    # Walk the query backwards, then restore reading order before joining.
    query_tail = _take_sentences(reversed(sent_tokenize(query.line)), max_tokens)
    new_input = " ".join(reversed(query_tail)).strip()

    reply_head = _take_sentences(sent_tokenize(reply.line), max_tokens)
    new_reply = " ".join(reply_head).strip()

    return (Line(query.speaker, new_input), Line(reply.speaker, new_reply))

In [5]:
def make_pairs(lines):
    """Build (query, reply) pairs for TARGET_CHARACTER from one scene's lines.

    Each line is expected to look like ``SPEAKER: text``; lines without a
    colon are ignored. A pair is emitted for every line that
      * is spoken by TARGET_CHARACTER (paired with whatever line precedes it), or
      * is spoken by "ALL" or by a speaker label containing TARGET_CHARACTER,
        provided the preceding line is not TARGET_CHARACTER's own; such a
        reply is re-labelled as TARGET_CHARACTER.
    Both sides of each pair are truncated with shorten_pair.

    Args:
        lines: cleaned, non-empty lines of a single scene.

    Returns:
        List of ``(Line, Line)`` tuples; empty if TARGET_CHARACTER never
        speaks in the scene.
    """
    scene_characters = set()
    lines_structured = []
    for raw in lines:
        # Split on the FIRST colon only: the spoken text may itself contain
        # colons ("Meet me at 10:30"), and splitting on every colon would
        # silently drop everything after the second one.
        speaker, colon, words = raw.partition(":")
        if not colon:
            continue

        character = speaker.strip().upper()
        scene_characters.add(character)
        lines_structured.append(Line(character, words.strip()))

    if TARGET_CHARACTER not in scene_characters:
        return []

    line_pairs = []
    # Non-empty here: TARGET_CHARACTER was seen, so at least one line was kept.
    prev_line = lines_structured[0]
    for current in lines_structured[1:]:
        prev_is_target = prev_line.speaker == TARGET_CHARACTER
        if current.speaker == TARGET_CHARACTER:
            line_pairs.append(shorten_pair((prev_line, current)))
        elif (current.speaker == "ALL" or TARGET_CHARACTER in current.speaker) and not prev_is_target:
            # Group lines count as the target's reply, attributed to the target.
            relabelled = Line(TARGET_CHARACTER, current.line)
            line_pairs.append(shorten_pair((prev_line, relabelled)))
        prev_line = current

    return line_pairs
    

In [6]:
def pairs_to_string(pairs):
    """Serialise pairs as text, one ``query <+++++> reply`` record per line.

    Each element of ``pairs`` is a 2-tuple of objects exposing a ``line``
    attribute. Every record ends with a newline; no pairs gives ``""``.
    """
    records = [
        str(query.line) + " <+++++> " + str(reply.line) + "\n"
        for (query, reply) in pairs
    ]
    return "".join(records)

In [7]:
def process_file(file_name):
    """Extract all query/reply pair records from one script file.

    The file is read as ISO-8859-1 and split into scenes at every
    ``[ ... ]`` marker. Each non-blank scene is cleaned line by line with
    process_line, turned into pairs by make_pairs and serialised with
    pairs_to_string.

    Args:
        file_name: path of the script file to read.

    Returns:
        The concatenated pair records for the whole file ("" if none).
    """
    # ``with`` releases the handle even if reading raises.
    with open(file_name, 'r', encoding="ISO-8859-1") as f:
        f_contents = f.read()

    # Raw string: "\[" in a plain literal is an invalid escape sequence, which
    # Python reports as a warning (DeprecationWarning, SyntaxWarning from 3.12).
    scenes = re.split(r"\[.*\]", f_contents)

    chunks = []
    for scene in scenes:
        if scene.strip() == "":
            continue
        processed_lines = []
        for raw_line in scene.split("\n"):
            cleaned = process_line(raw_line)
            if cleaned != "":
                processed_lines.append(cleaned)
        chunks.append(pairs_to_string(make_pairs(processed_lines)))
    return "".join(chunks)

In [9]:
# Build Ross_all.txt: run every script under scripts/ through process_file and
# concatenate the resulting pair records.
# Sorted because os.listdir returns entries in arbitrary order; a stable order
# keeps the output file (and any seeded shuffle of it) reproducible.
scripts = sorted(os.listdir("scripts/"))
# Written as ISO-8859-1 to match how the *_all.txt files are read back below;
# the text was decoded from ISO-8859-1, so it always encodes cleanly.
with open("Ross_all.txt", 'w', encoding="ISO-8859-1") as all_data:
    for s in scripts:
        script_path = "scripts/" + s
        # Skip hidden entries (e.g. .DS_Store) and sub-directories, which are
        # not script files and would otherwise be parsed or raise on open().
        if s.startswith(".") or not os.path.isfile(script_path):
            continue
        file_data = process_file(script_path)
        if file_data is None:
            continue
        all_data.write(file_data)

# Partition data

In [3]:
def clean_line(text):
    """Replace a fixed set of stray characters with plain ASCII equivalents.

    ``\\x85`` becomes ``...``, ``\\x91``/``\\x92`` become an apostrophe,
    ``\\x96``/``\\x97`` become a hyphen, and ``Â`` is removed.
    NOTE(review): these look like Windows-1252 punctuation bytes surfacing
    after an ISO-8859-1 decode -- confirm against the raw data.
    """
    substitutions = (
        ('\x85', '...'),
        ('\x91', '\''),
        ('\x92', '\''),
        ('\x96', '-'),
        ('\x97', '-'),
        ('Â', ''),
    )
    for unwanted, replacement in substitutions:
        text = text.replace(unwanted, replacement)
    return text

In [5]:
def write_to_file(q_file, r_file, data):
    """Write cleaned query/reply records to two parallel files.

    Each entry of ``data`` is cleaned with clean_line and split on the
    ``" <+++++> "`` separator; the first part goes to ``q_file`` and the
    second to ``r_file``, one per line, so line N of both files belongs to
    the same pair.

    Entries without the separator (for example a blank line) are skipped
    instead of raising IndexError; skipping on both sides keeps the two
    files aligned.

    Args:
        q_file: writable text file receiving the queries.
        r_file: writable text file receiving the replies.
        data: iterable of ``query <+++++> reply`` strings.
    """
    for d in data:
        pair = clean_line(d).split(" <+++++> ")
        if len(pair) < 2:
            continue
        q_file.write(pair[0]+"\n")
        r_file.write(pair[1]+"\n")

def split_in_out(file_name):
    """Shuffle ``<file_name>_all.txt`` and write test/valid/train query and reply files.

    The corpus is read as ISO-8859-1, shuffled with a fixed seed (1776) and
    cut by position: the first 45000 records are the test set, the next
    35000 the validation set and the remainder the training set. For each
    split, ``../data/<file_name>_<split>_query.en`` and
    ``../data/<file_name>_<split>_reply.en`` are written via write_to_file.

    Args:
        file_name: corpus prefix, e.g. ``"Cornell"`` for ``Cornell_all.txt``.
    """
    # Context manager so the input handle is always closed (it previously
    # stayed open for the life of the kernel).
    with open(file_name+"_all.txt", 'r', encoding = "ISO-8859-1") as f:
        # The text after the final newline is dropped.
        data = f.read().split("\n")[:-1]
    random.Random(1776).shuffle(data)

    splits = (
        ("test", data[:45000]),
        ("valid", data[45000:80000]),
        ("train", data[80000:]),
    )

    end_point = "../data/" + file_name
    for split_name, rows in splits:
        query_path = end_point + "_" + split_name + "_query.en"
        reply_path = end_point + "_" + split_name + "_reply.en"
        with open(query_path, 'w') as q_file, open(reply_path, 'w') as r_file:
            write_to_file(q_file, r_file, rows)
    
        
# Partition Cornell_all.txt into the test/valid/train query and reply files under ../data/.
split_in_out("Cornell")

In [15]:
def divide_data():
    """Shuffle Cornell_all.txt and write test/valid/train text files to ../data/.

    The corpus is read as ISO-8859-1, its record count is printed, and the
    records are shuffled with a fixed seed (1776). Four files are written,
    each record cleaned with clean_line and newline-terminated:
      * ``Cornell_test.txt``     <- records [6719:44321]
      * ``Cornell_valid.txt``    <- records [44321:79777]
      * ``Cornell_1_train.txt``  <- records [79777:]
      * ``Cornell_2_train.txt``  <- records [:6719]
    """
    cat = "Cornell"
    # Context manager so the input handle is closed; it was previously leaked.
    with open(cat+"_all.txt", 'r', encoding = "ISO-8859-1") as f:
        data = f.read().split("\n")[:-1]
    print(len(data))
    random.Random(1776).shuffle(data)
    #8719 pieces of Ross data, (test, valid, train) 1000, 1000, 6719
    #221605 pieces of Cornell data, (test, valid, train) 44321, 35456(79777), rest
    # NOTE(review): the test slice starts at 6719, so it holds 37602 records
    # rather than the 44321 the comment above suggests -- confirm this is intended.
    partitions = (
        ("_test.txt", data[6719:44321]),
        ("_valid.txt", data[44321:79777]),
        ("_1_train.txt", data[79777:]),
        ("_2_train.txt", data[:6719]),
    )

    for suffix, rows in partitions:
        with open("../data/" + cat + suffix, 'w') as out_file:
            for d in rows:
                out_file.write(clean_line(d)+"\n")

# Write the Cornell test/valid/train text files under ../data/ (prints the corpus size).
divide_data()

221605


In [5]:
import os
import sys

# sys.path entries are used literally: "~" is only expanded by a shell, so the
# home-relative path must be expanded here or it can never match a directory.
# NOTE(review): `from multimodal_keras_wrapper.dataset import ...` needs the
# directory that CONTAINS the `multimodal_keras_wrapper` package on sys.path --
# confirm the checkout layout if the import still fails.
sys.path.append(os.path.expanduser('~/multimodal_keras_wrapper'))


In [4]:
# Show the working directory; the relative paths used above ("scripts/", "../data/") resolve against it.
os.getcwd()

'/Users/Alex/RossBot/data_raw'

In [11]:
from multimodal_keras_wrapper.dataset import Dataset


ModuleNotFoundError: No module named 'multimodal_keras_wrapper'