Transcripts are formatted drastically differently from screenplay:

- Lines are attributed only as a new character is introduced, and subsequent lines do not explicity identify the speaking character.
- Dialog lines are split into multiple lines.
- Dialog lines randomly end in middle of lines.
- Action/Direction is mixed into dialog, need to regex away anything between []
- There's music, need to regex away anything between ♪♪

In [1]:
import json
import re

In [2]:
def open_json(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

In [3]:
mmm_scripts = open_json('./data/maisel_scripts.json')

In [4]:
def split_title(title):
    [season, episode] = map(lambda n: int(n), title.split(' - ')[0].split('x'))
    return season, episode

In [5]:
for s, script in enumerate(mmm_scripts):
    title = script['title']
    [x, y] = map(lambda n: int(n), title.split(' - ')[0].split('x'))
    print(s, x, y, flush=True)

0 1 1
1 1 2
2 1 3
3 1 4
4 1 5
5 1 6
6 1 7
7 1 8
8 2 1
9 2 2
10 2 3
11 2 4
12 2 5
13 2 6
14 2 7
15 2 8
16 2 9
17 2 10
18 3 1
19 3 2
20 3 3
21 3 4
22 3 5
23 3 6
24 3 7
25 3 8
26 4 1
27 4 2
28 4 3
29 4 4
30 4 5
31 4 6
32 4 7
33 4 8


In [6]:
def munge_script(script, count = False):
    curr_line = ""
    clean_script = []
    n = 0
    for line in script:
        strip_line = line.strip()
        clean_lines = strip_line.split('\n')
        for clean_line in clean_lines:
            # Remove music
            re_music = re.compile(r'♪.*♪')
            clean_line = re.sub(re_music, '', clean_line)

            # Remove action direction
            re_action = re.compile(r'\[.*?\]')
            clean_line = re.sub(re_action, '', clean_line)

            # Iteratively Remove dashes from speaker changes
            re_dash = re.compile(r'^ *- *')
            while re.match(re_dash, clean_line):
                clean_line = re.sub(re_dash, '', clean_line)

            # Remove speaker names
            re_name = re.compile(r'^\w+: *')
            clean_line = re.sub(re_name, '', clean_line)

            # Build out final dialog line
            curr_line = curr_line + clean_line
            n = n + len(curr_line.split(' '))
            if len(clean_line) > 0 and clean_line[-1] in ['.', '!', '?']:
                clean_script.append(curr_line)
                curr_line = ""
    if count:
        res = n
    else:
        res = clean_script
    return res

In [9]:
clean_scripts = []
for script in mmm_scripts:
    [season, episode] = split_title(script['title'])
    clean = munge_script(script['script'], True)
    clean_scripts.append({
        'season': season,
        'episode': episode,
        'count': clean
    })

In [10]:
clean_scripts

[{'season': 1, 'episode': 1, 'count': 9465},
 {'season': 1, 'episode': 2, 'count': 9532},
 {'season': 1, 'episode': 3, 'count': 7816},
 {'season': 1, 'episode': 4, 'count': 8723},
 {'season': 1, 'episode': 5, 'count': 13626},
 {'season': 1, 'episode': 6, 'count': 8069},
 {'season': 1, 'episode': 7, 'count': 7881},
 {'season': 1, 'episode': 8, 'count': 8793},
 {'season': 2, 'episode': 1, 'count': 8349},
 {'season': 2, 'episode': 2, 'count': 8813},
 {'season': 2, 'episode': 3, 'count': 7925},
 {'season': 2, 'episode': 4, 'count': 7591},
 {'season': 2, 'episode': 5, 'count': 8965},
 {'season': 2, 'episode': 6, 'count': 6450},
 {'season': 2, 'episode': 7, 'count': 8633},
 {'season': 2, 'episode': 8, 'count': 6868},
 {'season': 2, 'episode': 9, 'count': 7899},
 {'season': 2, 'episode': 10, 'count': 8618},
 {'season': 3, 'episode': 1, 'count': 8961},
 {'season': 3, 'episode': 2, 'count': 8307},
 {'season': 3, 'episode': 3, 'count': 8381},
 {'season': 3, 'episode': 4, 'count': 8010},
 {'seaso

In [12]:
# Combine with runtimes
mmm_runtimes = open_json('./data/maisel_runtimes.json')

In [14]:
def searchFor(season, episode, data):
    for d in data:
        if d['episode'] == episode and d['season'] == season:
            return d
    return []

In [16]:
for script in clean_scripts:
    run = searchFor(script['season'], script['episode'], mmm_runtimes)
    script['runtime'] = run['runtime']

In [18]:
# Write output
with open('./output/mmm_data.json', 'w+') as outfile:
    json.dump(clean_scripts, outfile)