In [1]:
from collections import Counter
import os
import numpy as np

In [2]:
# Name of the movie stimulus; it is interpolated into the input file paths
# built in the cells below and into the per-segment transcript file names.
stimulus = 'hidden_figures'

# Get Character Labels

NOTE: Before running the following cells, use **Screenplay Character Labeling - Using Subtitles File.ipynb** to create the subtitles-with-characters (.txt) file from the subtitles (.srt) file.

In [3]:
def gather_character_info(subtitles_with_characters_path):
    """Collect the character labels found in a character-annotated subtitles file.

    Each line has its newline and tab characters removed. A line contributes a
    label when it contains the separator ' - '; the label is the text before
    the first occurrence of that separator. Lines without it are ignored.

    Parameters
    ----------
    subtitles_with_characters_path : str
        Path to the text file to read.

    Returns
    -------
    char2idx : dict
        Maps each distinct label to an integer index, numbered in order of
        first appearance in the file.
    char_counts : collections.Counter
        Number of labelled lines seen for each label.
    """
    separator = ' - '
    labels = []
    with open(subtitles_with_characters_path, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.replace('\n', '').replace('\t', '')
            # partition() splits on the first separator; `found` is empty
            # when the line has no separator at all.
            label, found, _ = cleaned.partition(separator)
            if found:
                labels.append(label)

    char_counts = Counter(labels)
    # Counter preserves insertion order, so indices follow first appearance.
    char2idx = {label: position for position, label in enumerate(char_counts)}
    return char2idx, char_counts

In [4]:
# Path to the character-annotated subtitles (.txt) file for the selected stimulus.
subtitles_with_characters_path = 'subtitles_with_characters/{}_subtitles.txt'.format(stimulus)

In [5]:
# Build the character -> index mapping and the per-character labelled-line counts.
char2idx, char_counts = gather_character_info(subtitles_with_characters_path)

In [26]:
# Show the character -> index mapping, two blank lines, then the per-character counts.
print("char2idx", char2idx, "\n", sep="\n")
print("char_counts", char_counts, sep="\n")

char2idx
{'OTHER': 0, 'KATHERINE': 1, 'DOROTHY': 2, 'MARY': 3, 'AL HARRISON': 4, 'PAUL STAFFORD': 5, 'RUTH': 6, 'VIVIAN MITCHELL': 7, 'JIM JOHNSON': 8, 'JOHN GLENN': 9, 'MISSION CONTROL COMMANDER': 10}


char_counts
Counter({'OTHER': 375, 'KATHERINE': 320, 'AL HARRISON': 299, 'DOROTHY': 175, 'MARY': 158, 'PAUL STAFFORD': 95, 'VIVIAN MITCHELL': 75, 'JOHN GLENN': 52, 'JIM JOHNSON': 40, 'RUTH': 34, 'MISSION CONTROL COMMANDER': 30})


# Word-level one-hot character features 

In [14]:
def generate_word_level_character_features(timed_transcript_with_characters_folder, num_segments, char2idx, stimulus=None):
    """Build one-hot character features for every word of the timed transcripts.

    Reads the files ``<stimulus>_seg01_mono.txt`` .. ``<stimulus>_segNN_mono.txt``
    (NN = ``num_segments``, zero-padded to two digits) from the given folder.
    Newlines and tabs are removed from each line and empty lines are skipped.
    Every remaining line is split on ':' into ``timestamp : character : word``;
    any further ':' characters are kept as part of the word.

    Parameters
    ----------
    timed_transcript_with_characters_folder : str
        Folder containing the per-segment transcript files.
    num_segments : int
        Number of segment files to read, numbered from 1.
    char2idx : dict
        Maps a character label to its column index in the one-hot encoding.
    stimulus : str, optional
        Stimulus name used as the file-name prefix. When omitted, the
        module-level variable ``stimulus`` is used (the original behaviour).

    Returns
    -------
    char_feats : numpy.ndarray
        Array of shape (number of words, len(char2idx)) with a single 1 per
        row in the column of the word's character.
    words : list of str
        The words, in file order.
    word_timestamps : numpy.ndarray
        The timestamp strings, in file order.

    Raises
    ------
    NameError
        If ``stimulus`` is not passed and no module-level ``stimulus`` exists.
    IndexError
        If a non-empty line contains no ':' separator.
    KeyError
        If a line names a character that is not a key of ``char2idx``.
    """
    if stimulus is None:
        # Backward compatibility: the function used to read the notebook-level
        # `stimulus` variable through a `global` statement.
        try:
            stimulus = globals()['stimulus']
        except KeyError:
            raise NameError("name 'stimulus' is not defined; pass stimulus=... explicitly")

    num_chars = len(char2idx)
    word_timestamps = []
    words = []
    # Collect column indices first and build the matrix once at the end;
    # np.append inside the loop copies the whole array for every word.
    char_indices = []
    for seg in range(1, num_segments + 1):
        segment_transcript_file = '{}_seg{}_mono.txt'.format(stimulus, str(seg).zfill(2))
        timed_transcript_with_characters = os.path.join(timed_transcript_with_characters_folder, segment_transcript_file)
        with open(timed_transcript_with_characters, 'r') as f:
            for line in f:
                line = line.replace('\n', '').replace('\t', '')
                if line == '':
                    continue

                # maxsplit=2 keeps any ':' inside the word text intact.
                fields = line.split(":", 2)
                if len(fields) < 2:
                    raise IndexError(
                        "malformed line {!r} in {}: expected 'timestamp: CHARACTER: word'".format(
                            line, timed_transcript_with_characters))
                t = fields[0].strip()
                c = fields[1].strip()
                w = fields[2].strip() if len(fields) > 2 else ''

                if c not in char2idx:
                    raise KeyError(
                        "unknown character {!r} in {} (line {!r})".format(
                            c, timed_transcript_with_characters, line))

                char_indices.append(char2idx[c])
                words.append(w)
                word_timestamps.append(t)

    if char_indices:
        char_feats = np.zeros((len(char_indices), num_chars))
        char_feats[np.arange(len(char_indices)), char_indices] = 1
    else:
        # Same empty result (shape and integer dtype) as the original code.
        char_feats = np.empty((0, num_chars), int)
    word_timestamps = np.array(word_timestamps)
    return char_feats, words, word_timestamps

In [15]:
# Folder holding the per-segment timed transcripts annotated with characters.
timed_transcript_with_characters_folder = 'movies_transcripts_with_characters/{}/'.format(stimulus)
# Number of segment files to read; segments are numbered from 1 (seg01 .. seg12).
num_segments = 12

In [16]:
# One one-hot character row per transcript word, plus the words and their timestamps.
char_feats, words, word_timestamps = generate_word_level_character_features(timed_transcript_with_characters_folder, num_segments, char2idx)

In [22]:
# Save the word-level one-hot character features. The file name is derived
# from `stimulus` (instead of a hard-coded movie name) so that it stays in
# sync with the inputs when the stimulus is changed; for 'hidden_figures'
# the resulting path is unchanged.
character_feats_path = 'movies_word_level_character_features/{}_character_feats.npy'.format(stimulus)
# np.save does not create missing directories.
os.makedirs(os.path.dirname(character_feats_path), exist_ok=True)
np.save(character_feats_path, char_feats, allow_pickle=True)