# Import

In [1]:
from collections import Counter
import os
import numpy as np

In [2]:
# Name of the stimulus to process; it is interpolated into the subtitle,
# transcript and output paths, and into the per-segment file names below.
stimulus = 'hidden_figures'

# Helper functions

## Get character labels and map them to indices

In [3]:
def gather_character_info(subtitles_with_characters_path):
    """Collect character labels from a character-annotated subtitle file.

    Newline and tab characters are removed from every line. If the cleaned
    line contains the separator ``' - '``, the text before its first
    occurrence is taken as the character label; lines without the separator
    are ignored.

    Args:
        subtitles_with_characters_path: Path of the text file to read.

    Returns:
        A ``(character2idx, character_counts)`` tuple. ``character_counts``
        is a ``Counter`` with the number of matching lines per label, and
        ``character2idx`` maps each label to an integer index assigned in
        order of first appearance in the file.
    """
    separator = ' - '
    with open(subtitles_with_characters_path, 'r') as handle:
        cleaned_lines = (raw.replace('\n', '').replace('\t', '') for raw in handle)
        # The comprehension is evaluated inside the `with` block, so the lazy
        # generator above is fully consumed before the file is closed.
        speakers = [
            text.partition(separator)[0]
            for text in cleaned_lines
            if separator in text
        ]

    character_counts = Counter(speakers)
    # Counter keeps insertion order, so indices follow first appearance.
    character2idx = {name: position for position, name in enumerate(character_counts)}
    return character2idx, character_counts

## Create word-level one-hot character features 

In [7]:
def generate_word_level_character_features(seg_transcript_path, character2idx, seg_out_path):
    """Build one-hot character features for every word of a segment transcript.

    Each non-blank transcript line is parsed as ``timestamp:character:word``.
    Only the first two colons act as separators, so the word itself may
    contain colons. Newline and tab characters are removed from the line and
    each field is stripped of surrounding whitespace. Blank and
    whitespace-only lines are skipped.

    The result is also written to ``seg_out_path`` with ``np.save`` as a
    pickled dict with keys ``'char_feats'``, ``'words'`` and ``'timestamps'``
    (load it back with ``np.load(path, allow_pickle=True).item()``).

    Args:
        seg_transcript_path: Path of the transcript text file to read.
        character2idx: Mapping from character label to column index; its
            length sets the width of the feature matrix.
        seg_out_path: Destination passed to ``np.save``.

    Returns:
        A ``(char_feats, words, timestamps)`` tuple: ``char_feats`` is a
        float64 array of shape ``(num_words, len(character2idx))`` with a
        single 1 per row, ``words`` is a list of strings and ``timestamps``
        is a 1-D float array.

    Raises:
        IndexError: If a non-blank line contains no colon, or a mapped index
            does not fit the feature matrix.
        KeyError: If a line's character label is not in ``character2idx``.
        ValueError: If a timestamp cannot be converted to ``float``.
    """
    num_characters = len(character2idx)
    words, timestamps, char_indices = [], [], []
    with open(seg_transcript_path, 'r') as f:
        for line in f:
            line = line.replace('\n', '').replace('\t', '')
            if not line.strip():
                continue

            # Split once; maxsplit=2 keeps any further colons inside the word.
            fields = line.split(":", 2)
            t = fields[0].strip()
            c = fields[1].strip()
            w = fields[2].strip() if len(fields) > 2 else ''

            char_indices.append(character2idx[c])
            words.append(w)
            timestamps.append(float(t))
    timestamps = np.array(timestamps)

    # Allocate the matrix once and set all the ones in a single indexed
    # assignment, instead of growing the array row by row (which recopies
    # the whole matrix for every word).
    char_feats = np.zeros((len(char_indices), num_characters))
    if char_indices:
        rows = np.arange(len(char_indices))
        char_feats[rows, np.asarray(char_indices, dtype=int)] = 1

    np.save(seg_out_path, {'char_feats': char_feats, 'words': words, 'timestamps': timestamps}, allow_pickle=True)

    return char_feats, words, timestamps

# Testing this code with an example

In [None]:
# Subtitle file read by gather_character_info to collect the character labels.
subtitles_with_characters_path = 'subtitles_with_characters/{}_subtitles.txt'.format(stimulus)
# Folder holding the per-segment transcript files for this stimulus.
movies_transcripts_with_characters_folder = 'movies_transcripts_with_characters/{}/'.format(stimulus)
# Folder that receives one .npy feature file per segment.
out_folder = 'movies_word_level_character_embeddings/{}'.format(stimulus)

In [6]:
character2idx, character_counts = gather_character_info(subtitles_with_characters_path)

# Emit each label followed by its value, with blank lines between the two
# groups, using one print call whose items are joined by newlines.
print(
    "character2idx",
    character2idx,
    "\n",
    "character_counts",
    character_counts,
    sep="\n",
)

character2idx
{'OTHER': 0, 'KATHERINE': 1, 'DOROTHY': 2, 'MARY': 3, 'AL HARRISON': 4, 'PAUL STAFFORD': 5, 'RUTH': 6, 'VIVIAN MITCHELL': 7, 'JIM JOHNSON': 8, 'JOHN GLENN': 9, 'MISSION CONTROL COMMANDER': 10}


character_counts
Counter({'OTHER': 375, 'KATHERINE': 320, 'AL HARRISON': 299, 'DOROTHY': 175, 'MARY': 158, 'PAUL STAFFORD': 95, 'VIVIAN MITCHELL': 75, 'JOHN GLENN': 52, 'JIM JOHNSON': 40, 'RUTH': 34, 'MISSION CONTROL COMMANDER': 30})


In [8]:
num_segments = 12

# np.save does not create missing parent directories, so make sure the
# output folder exists before writing the first segment.
os.makedirs(out_folder, exist_ok=True)

# Segments are numbered from 1 to num_segments inclusive.
for seg in range(1, num_segments+1):
    seg_id = str(seg).zfill(2)  # zero-padded two-digit id, e.g. '01'
    segment_transcript_file = '{}_seg{}_mono.txt'.format(stimulus, seg_id)
    segment_transcript_path = os.path.join(movies_transcripts_with_characters_folder, segment_transcript_file)
    out_file = '{}_seg{}.npy'.format(stimulus, seg_id)
    out_path = os.path.join(out_folder, out_file)
    generate_word_level_character_features(segment_transcript_path, character2idx, out_path)