In [1]:
import re
from pathlib import Path

import networkx as nx
from pyvis import network as net

from utils.functions import get_files, parse_script, extract_character_names, count_interactions, count_interactions_df

In [2]:
# Step 1: download the two script PDFs from the internet.
# Disclaimer: I don't own any of this content; both files are publicly available online.
# Maps a short label for each draft to the URL of its PDF.
# NOTE(review): the recorded cell output suggests each key is used as the file name under data/ — confirm in get_files.
url_list = {
    '2014_script': 'https://assets.scriptslug.com/live/pdf/scripts/interstellar-2014.pdf',
    '2008_script': 'https://s3-us-west-2.amazonaws.com/screenplays-pdf/Movie-Screenplay-PDFs/Interstellar-Jonathan-Nolan.pdf'
}

# Per the recorded output below, this downloads each PDF and converts it to a .txt file.
get_files(url_list)

Downloaded: data/2014_script.pdf
Converted data/2014_script.pdf to data/2014_script.txt
Downloaded: data/2008_script.pdf
Converted data/2008_script.pdf to data/2008_script.txt


In [3]:
# Load the plain-text version of each draft.
with open('data/2014_script.txt', 'r') as file:
    script_2014 = file.read()

with open('data/2008_script.txt', 'r') as file:
    script_2008 = file.read()

# Extract the candidate character names found in each draft.
names_2014 = extract_character_names(script_2014)
names_2008 = extract_character_names(script_2008)

print("Unique names in 2014 script:", names_2014)
print("Unique names in 2008 script:", names_2008)

# Names present in both drafts, kept in the order they appear in the 2008 list.
# A set makes each membership test constant-time instead of scanning the 2014 list.
names_2014_lookup = set(names_2014)
unchanged_character = [name for name in names_2008 if name in names_2014_lookup]
print("Unchanged characters:", unchanged_character)

Unique names in 2014 script: ['ADMINISTRATOR', 'BOOTS', 'BRAND', 'CASE', 'CB OPERATOR', 'COMPUTER VOICE', 'COOP', 'COOPER', 'DOCTOR', 'DONALD', 'DOYLE', 'DR MANN', 'ELDERLY FEMALE VOICE', 'ELDERLY WOMAN', 'GETTY', 'I N T E R S T E L L A R', 'INTERSTELLAR', 'KIPP', 'LOIS', 'MACHINE', 'MS HANLEY', 'MURPH', 'NURSE', 'OLD MAN', 'OMITTED', 'PILOT', 'PRINCIPAL', 'PROFESSOR BRAND', 'RADIO', 'ROMILLY', 'TARS', 'TOM', 'VOICE', 'WILLIAMS', 'WOMAN']
Unique names in 2008 script: ['ADMINISTRATOR', 'ANSEN', 'ASSISTANT', 'BALLPLAYER', 'BLACK', 'BRAND', "BRAND'S FATHER", 'CASE', 'CHINESE OFFICER', 'COOPER', 'DOCTOR', 'DONALD', 'DOYLE', 'EMILY COOPER', 'END', 'ENGINEER ROBOT', 'FADE TO BLACK', 'FARMER', 'GOVERNMENT MAN', 'LIU', 'MURPH', "MURPH'S WIFE", 'NSA AGENT', 'OLD ENGINEER', 'OLD MAN', 'PRINCIPAL', 'RIGGS', 'ROBOT', 'ROTH', 'TARS', 'TOM', 'WIFE', 'WORLD FAMOUS NEW YORK YANKEES']
Unchanged charcters: ['ADMINISTRATOR', 'BRAND', 'CASE', 'COOPER', 'DOCTOR', 'DONALD', 'DOYLE', 'MURPH', 'OLD MAN', 'PRI

The filter is not perfect, but it is simple enough to serve its purpose. I will remove non-character names from the analysis.

The format of the script follows a repeated pattern, such as "INT COOPER MURPH COOPER... EXT BRAND COOPER... INT." This pattern is simple enough to count interactions between characters. The method is to examine the names that appear within a single scene: names that appear consecutively are counted as direct interactions, and names that appear with one other name in between are counted as indirect interactions.

This approach is not 100% perfect, as it only counts dialogue rather than actions. However, scenes without dialogue are in the minority (this is not an action movie!), as almost all scenes with at least two characters contain dialogue.

In [4]:
# First, let's examine the 2014 script
with open('data/2014_script.txt', 'r') as file:
    script_2014 = file.read()

# Parse the script into a flat sequence of entries (used below as strings)
parsed_script_14 = parse_script(script_2014)

# Entries that are not characters and must be dropped from the analysis
to_remove = [
    'I N T E R S T E L L A R', 'INTERSTELLAR', 'VOICE',
    'COMPUTER VOICE', 'CB OPERATOR', 'OMITTED',
    'ELDERLY FEMALE VOICE', 'ELDERLY WOMAN'
]
parsed_script_14 = [item for item in parsed_script_14 if item not in to_remove]

# Replacing aliases, well, you need to watch the movie to know though...
alias_mapping = {
    'PILOT': 'COOPER',
    'MACHINE': 'TARS',
    'OLD MAN': 'PROFESSOR BRAND',
    'RADIO': 'ATC'
}
parsed_script_14 = [alias_mapping.get(name, name) for name in parsed_script_14]

# Sometimes a character appears as WOMAN or MAN (NAME) because their name hasn't been revealed yet.
# Walk the script backwards: an entry of the form "CHARACTER (NAME)" is replaced by NAME, and when
# that entry mentions WOMAN the name is remembered so that a bare "WOMAN" entry met afterwards in
# the backward walk (i.e. earlier in the script) is replaced with that name.
woman_names = []
processed_script = []
for line in reversed(parsed_script_14):
    match = re.match(r'^\s*[\w\s]+?\s*\((\w+)\)\s*$', line)  # Match any format "CHARACTER (name)"
    if match:
        name = match.group(1)  # Keep only the name inside the parentheses
        processed_script.append(name)
        if "WOMAN" in line:
            woman_names.append(name)  # Remember it for bare WOMAN entries
    elif line == "WOMAN" and woman_names:
        processed_script.append(woman_names[-1])  # Most recently seen name in the backward walk
    else:
        processed_script.append(line)  # Everything else (including an unresolved WOMAN) is kept as is

# Restore the original script order
parsed_script_14 = list(reversed(processed_script))

# Save the cleaned sequence, one entry per line
with open('data/parsed_script_14.txt', 'w') as file:
    for item in parsed_script_14:
        file.write(item + '\n')

In [5]:
# 2008 script
with open('data/2008_script.txt', 'r') as file:
    script_2008 = file.read()

# Parse the script
parsed_script_08 = parse_script(script_2008)

# Normalise first: remove any parenthetical and surrounding whitespace so that the removal and
# alias lookups below see the bare name, then drop entries that end up empty
# (e.g. an entry that consisted only of a parenthetical).
parsed_script_08 = [re.sub(r'\s*\(.*?\)\s*', '', name).strip() for name in parsed_script_08]
parsed_script_08 = [name for name in parsed_script_08 if name]

# List of names to remove (not characters)
to_remove2 = ['BLACK', 'WORLD FAMOUS NEW YORK YANKEES', 'END', 'FADE TO BLACK']
parsed_script_08 = [item for item in parsed_script_08 if item not in to_remove2]

# Map aliases onto a single name per character
alias_mapping2 = {
    'ROBOT': 'TARS',
    "BRAND'S FATHER": 'PROFESSOR BRAND',
    'OLD ENGINEER': 'ASSISTANT'
}
parsed_script_08 = [alias_mapping2.get(name, name) for name in parsed_script_08]

# Save the cleaned sequence, one entry per line
with open('data/parsed_script_08.txt', 'w') as file:
    for item in parsed_script_08:
        file.write(item + '\n')

In [6]:
# 2014 script: tally the interactions, add a combined total column, and save the table as CSV
interactions_2014 = count_interactions_df(parsed_script_14)
interactions_2014['total_interactions'] = (
    interactions_2014['direct_interaction'].add(interactions_2014['indirect_interaction'])
)
interactions_2014.to_csv('data/2014_script_interactions.csv', index=False)

# 2008 script: same steps
interactions_2008 = count_interactions_df(parsed_script_08)
interactions_2008['total_interactions'] = (
    interactions_2008['direct_interaction'].add(interactions_2008['indirect_interaction'])
)
interactions_2008.to_csv('data/2008_script_interactions.csv', index=False)

In [7]:
# Build one undirected graph per script (G1 = 2014, G2 = 2008).
# Each row of the interaction table becomes an edge between 'character1' and 'character2'
# that carries the row's 'total_interactions' value as an edge attribute.
G1, G2 = (
    nx.from_pandas_edgelist(
        edge_table,
        source='character1',
        target='character2',
        edge_attr='total_interactions',
        create_using=nx.Graph(),  # a fresh graph instance for every table
    )
    for edge_table in (interactions_2014, interactions_2008)
)

In [8]:
def build_interaction_network(graph, output_path):
    """Render a character-interaction graph as an interactive pyvis HTML page.

    Parameters
    ----------
    graph : networkx graph
        Graph whose edges carry a 'total_interactions' attribute.
    output_path : str
        Path of the HTML file to write; its parent directory is created if missing.

    Returns
    -------
    The pyvis Network that was built and saved.
    """
    network = net.Network(notebook=True, width='1200px', height='700px', cdn_resources='in_line',
                          bgcolor="#222222", font_color="white")
    network.barnes_hut(gravity=-5500, central_gravity=0.2, spring_length=200, spring_strength=0.05)

    # Node size is twice the node's degree
    for node, degree in graph.degree():
        network.add_node(node, size=degree * 2)

    # The interaction total drives the edge value, hover title and width
    for u, v, data in graph.edges(data=True):
        total_interactions = data['total_interactions']
        network.add_edge(u, v, value=total_interactions, title=str(total_interactions),
                         width=total_interactions)

    # Make sure the target directory exists so saving does not fail on a fresh checkout
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    network.save_graph(output_path)
    return network


# Network for G1 (2014 script)
net1 = build_interaction_network(G1, 'output/interstellar_2014.html')

# Network for G2 (2008 script)
net2 = build_interaction_network(G2, 'output/interstellar_2008.html')