In [1]:
import glob, os
import re
import bleach
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
TARGET_CHARACTER = "ROSS"
class Line: 
    def __init__ (self, speaker, line):
        self.speaker = speaker
        self.line = line

    def __str__ (self):
        return self.speaker + ": " + self.line

In [3]:
def process_line(line):
    step1 = line.strip()
    step2 = re.sub(r'\([^)]*\)',"", step1)
    return step2

In [4]:
def make_pairs(lines):
    scene_characters = {}
    lines_structured = []
    for l in lines:
        l_fields = l.split(":")
        if len(l_fields) == 1:
            continue
            
        character = l_fields[0].strip().upper()
        if character not in scene_characters:
            scene_characters[character] = 0
        scene_characters[character] += 1
        character_words = l_fields[1].strip()
        lines_structured.append(Line(character, character_words))
        
    if TARGET_CHARACTER not in scene_characters:
        return []
    
    line_pairs = []
    prev_line = lines_structured[0]
    for l in lines_structured[1:]:
        if l.speaker == TARGET_CHARACTER:
            line_pairs.append((prev_line, l))
        elif l.speaker == "ALL" and prev_line.speaker != TARGET_CHARACTER:
            l_new = Line(TARGET_CHARACTER, l.line)
            line_pairs.append((prev_line, l_new))
            
        elif TARGET_CHARACTER in l.speaker and prev_line.speaker != TARGET_CHARACTER:
            l_new = Line(TARGET_CHARACTER, l.line)
            line_pairs.append((prev_line, l_new))
        prev_line = l 
    
    return line_pairs
    

In [5]:
def pairs_to_string(pairs):
    ret = ""
    for (p1, p2) in pairs:
        ret += str(p1.line) + " <+++++> " + str(p2.line) + "\n"
    return ret 

In [6]:
def process_file(file_name):
    f = open(file_name, 'r', encoding = "ISO-8859-1")
    f_contents = f.read()
    f.close()
    scenes = re.compile("\[.*\]").split(f_contents)
    
    pairs_from_file = ""
    for scene in scenes:
        scene_strip = scene.strip()
        if scene_strip == "":
            continue 
        scene_lines = scene.split("\n")
        processed_lines = []
        for l in scene_lines:
            tmp = process_line(l)
            if tmp != "":
                processed_lines.append(tmp)
        line_pairs = make_pairs(processed_lines)
        pairs_txt = pairs_to_string(line_pairs)
        pairs_from_file += pairs_txt
    return pairs_from_file

In [7]:
scripts = os.listdir("scripts/")
all_data = open("Ross_responses.txt", 'w')
for s in scripts:
    print(s)
    file_data = process_file("scripts/" + s)
    if file_data == None:
        continue
    all_data.write(file_data)
all_data.close()

07outtakes.txt
0417.txt
0403.txt
0601.txt
0205.txt
0211.txt
0210.txt
0204.txt
0614.txt
0402.txt
0416.txt
0414.txt
0602.txt
0819.txt
0206.txt
0207.txt
0818.txt
0617.txt
0603.txt
0415.txt
0401.txt
0405.txt
0411.txt
0607.txt
0613.txt
0808.txt
0820.txt
0217.txt
0203.txt
0202.txt
0216.txt
0821.txt
0809.txt
0612.txt
0606.txt
0410.txt
0404.txt
0412.txt
0406.txt
0610.txt
0604.txt
0823.txt
0214.txt
0215.txt
0201.txt
0822.txt
0605.txt
0611.txt
0407.txt
0413.txt
0312.txt
0306.txt
0110.txt
0104.txt
0702.txt
0716.txt
0919.txt
0514.txt
0515.txt
0501.txt
0918.txt
0717.txt
0703.txt
0105.txt
0111.txt
0307.txt
0313.txt
0305.txt
0311.txt
0107.txt
0113.txt
0715.txt
0701.txt
0517.txt
0503.txt
0502.txt
0516.txt
0714.txt
0112.txt
0106.txt
0310.txt
0304.txt
0314.txt
0102.txt
0116.txt
0710.txt
0704.txt
0512.txt
0506.txt
0507.txt
0513.txt
0922.txt
0705.txt
0711.txt
0117.txt
0103.txt
0315.txt
0301.txt
1009.txt
0317.txt
0303.txt
0115.txt
0101.txt
0707.txt
0713.txt
0908.txt
0920.txt
0505.txt
0511.txt
0510.txt
0504